* [PATCH v3 1/8] PCI/AER: Check log level once and propagate down
2025-03-19 8:40 [PATCH v3 0/8] Rate limit AER logs Jon Pan-Doh
@ 2025-03-19 8:40 ` Jon Pan-Doh
2025-03-20 0:07 ` Sathyanarayanan Kuppuswamy
2025-03-19 8:40 ` [PATCH v3 2/8] PCI/AER: Make all pci_print_aer() log levels depend on error type Jon Pan-Doh
` (7 subsequent siblings)
8 siblings, 1 reply; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-19 8:40 UTC (permalink / raw)
To: Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Sathyanarayanan Kuppuswamy,
Lukas Wunner, Jonathan Cameron, Terry Bowman, Jon Pan-Doh
From: Karolina Stolarek <karolina.stolarek@oracle.com>
When reporting an AER error, we check its type multiple times
to determine the log level for each message. Do this check only
in the top-level functions (aer_isr_one_error(), pci_print_aer()) and
propagate the result down the call chain.
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
Signed-off-by: Jon Pan-Doh <pandoh@google.com>
Reviewed-by: Jon Pan-Doh <pandoh@google.com>
---
drivers/pci/pci.h | 2 +-
drivers/pci/pcie/aer.c | 37 ++++++++++++++++++++-----------------
drivers/pci/pcie/dpc.c | 2 +-
3 files changed, 22 insertions(+), 19 deletions(-)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index b8911d1e10dc..75985b96ecc1 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -551,7 +551,7 @@ struct aer_err_info {
};
int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
-void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
+void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
unsigned int tlp_len, bool flit,
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9cff7069577e..cc9c80cd88f3 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -670,20 +670,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
}
static void __aer_print_error(struct pci_dev *dev,
- struct aer_err_info *info)
+ struct aer_err_info *info,
+ const char *level)
{
const char **strings;
unsigned long status = info->status & ~info->mask;
- const char *level, *errmsg;
+ const char *errmsg;
int i;
- if (info->severity == AER_CORRECTABLE) {
+ if (info->severity == AER_CORRECTABLE)
strings = aer_correctable_error_string;
- level = KERN_WARNING;
- } else {
+ else
strings = aer_uncorrectable_error_string;
- level = KERN_ERR;
- }
for_each_set_bit(i, &status, 32) {
errmsg = strings[i];
@@ -696,11 +694,11 @@ static void __aer_print_error(struct pci_dev *dev,
pci_dev_aer_stats_incr(dev, info);
}
-void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
+void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
+ const char *level)
{
int layer, agent;
int id = pci_dev_id(dev);
- const char *level;
if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
@@ -711,8 +709,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
layer = AER_GET_LAYER_ERROR(info->severity, info->status);
agent = AER_GET_AGENT(info->severity, info->status);
- level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
-
aer_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
aer_error_severity_string[info->severity],
aer_error_layer[layer], aer_agent_string[agent]);
@@ -720,7 +716,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
aer_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n",
dev->vendor, dev->device, info->status, info->mask);
- __aer_print_error(dev, info);
+ __aer_print_error(dev, info, level);
if (info->tlp_header_valid)
pcie_print_tlp_log(dev, &info->tlp, dev_fmt(" "));
@@ -765,15 +761,18 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
{
int layer, agent, tlp_header_valid = 0;
u32 status, mask;
+ const char *level;
struct aer_err_info info;
if (aer_severity == AER_CORRECTABLE) {
status = aer->cor_status;
mask = aer->cor_mask;
+ level = KERN_WARNING;
} else {
status = aer->uncor_status;
mask = aer->uncor_mask;
tlp_header_valid = status & AER_LOG_TLP_MASKS;
+ level = KERN_ERR;
}
layer = AER_GET_LAYER_ERROR(aer_severity, status);
@@ -786,7 +785,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
- __aer_print_error(dev, &info);
+ __aer_print_error(dev, &info, level);
pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
aer_error_layer[layer], aer_agent_string[agent]);
@@ -1257,14 +1256,15 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
return 1;
}
-static inline void aer_process_err_devices(struct aer_err_info *e_info)
+static inline void aer_process_err_devices(struct aer_err_info *e_info,
+ const char *level)
{
int i;
/* Report all before handle them, not to lost records by reset etc. */
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
if (aer_get_device_error_info(e_info->dev[i], e_info))
- aer_print_error(e_info->dev[i], e_info);
+ aer_print_error(e_info->dev[i], e_info, level);
}
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
if (aer_get_device_error_info(e_info->dev[i], e_info))
@@ -1282,6 +1282,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
{
struct pci_dev *pdev = rpc->rpd;
struct aer_err_info e_info;
+ const char *level;
pci_rootport_aer_stats_incr(pdev, e_src);
@@ -1292,6 +1293,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
if (e_src->status & PCI_ERR_ROOT_COR_RCV) {
e_info.id = ERR_COR_ID(e_src->id);
e_info.severity = AER_CORRECTABLE;
+ level = KERN_WARNING;
if (e_src->status & PCI_ERR_ROOT_MULTI_COR_RCV)
e_info.multi_error_valid = 1;
@@ -1300,11 +1302,12 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
aer_print_port_info(pdev, &e_info);
if (find_source_device(pdev, &e_info))
- aer_process_err_devices(&e_info);
+ aer_process_err_devices(&e_info, level);
}
if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
e_info.id = ERR_UNCOR_ID(e_src->id);
+ level = KERN_ERR;
if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
e_info.severity = AER_FATAL;
@@ -1319,7 +1322,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
aer_print_port_info(pdev, &e_info);
if (find_source_device(pdev, &e_info))
- aer_process_err_devices(&e_info);
+ aer_process_err_devices(&e_info, level);
}
}
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index df42f15c9829..9e4c9ac737a7 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -289,7 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
dpc_get_aer_uncorrect_severity(pdev, &info) &&
aer_get_device_error_info(pdev, &info)) {
- aer_print_error(pdev, &info);
+ aer_print_error(pdev, &info, KERN_ERR);
pci_aer_clear_nonfatal_status(pdev);
pci_aer_clear_fatal_status(pdev);
}
--
2.49.0.rc1.451.g8f38331e32-goog
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH v3 1/8] PCI/AER: Check log level once and propagate down
2025-03-19 8:40 ` [PATCH v3 1/8] PCI/AER: Check log level once and propagate down Jon Pan-Doh
@ 2025-03-20 0:07 ` Sathyanarayanan Kuppuswamy
0 siblings, 0 replies; 28+ messages in thread
From: Sathyanarayanan Kuppuswamy @ 2025-03-20 0:07 UTC (permalink / raw)
To: Jon Pan-Doh, Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Lukas Wunner, Jonathan Cameron,
Terry Bowman
Hi,
On 3/19/25 1:40 AM, Jon Pan-Doh wrote:
> From: Karolina Stolarek <karolina.stolarek@oracle.com>
>
> When reporting an AER error, we check its type multiple times
> to determine the log level for each message. Do this check only
> in the top-level functions (aer_isr_one_error(), pci_print_aer()) and
> propagate the result down the call chain.
>
> Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> Reviewed-by: Jon Pan-Doh <pandoh@google.com>
> ---
With my comments fixed, you can add
Reviewed-by: Kuppuswamy Sathyanarayanan
<sathyanarayanan.kuppuswamy@linux.intel.com>
> drivers/pci/pci.h | 2 +-
> drivers/pci/pcie/aer.c | 37 ++++++++++++++++++++-----------------
> drivers/pci/pcie/dpc.c | 2 +-
> 3 files changed, 22 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index b8911d1e10dc..75985b96ecc1 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -551,7 +551,7 @@ struct aer_err_info {
> };
>
> int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
> -void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
> +void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
>
> int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
> unsigned int tlp_len, bool flit,
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 9cff7069577e..cc9c80cd88f3 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -670,20 +670,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
> }
>
> static void __aer_print_error(struct pci_dev *dev,
> - struct aer_err_info *info)
> + struct aer_err_info *info,
> + const char *level)
> {
> const char **strings;
> unsigned long status = info->status & ~info->mask;
> - const char *level, *errmsg;
> + const char *errmsg;
> int i;
>
> - if (info->severity == AER_CORRECTABLE) {
> + if (info->severity == AER_CORRECTABLE)
> strings = aer_correctable_error_string;
> - level = KERN_WARNING;
> - } else {
> + else
> strings = aer_uncorrectable_error_string;
> - level = KERN_ERR;
> - }
>
> for_each_set_bit(i, &status, 32) {
> errmsg = strings[i];
> @@ -696,11 +694,11 @@ static void __aer_print_error(struct pci_dev *dev,
> pci_dev_aer_stats_incr(dev, info);
> }
>
> -void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
> +void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
> + const char *level)
> {
> int layer, agent;
> int id = pci_dev_id(dev);
> - const char *level;
>
> if (!info->status) {
> pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> @@ -711,8 +709,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
> layer = AER_GET_LAYER_ERROR(info->severity, info->status);
> agent = AER_GET_AGENT(info->severity, info->status);
>
> - level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
> -
> aer_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> aer_error_severity_string[info->severity],
> aer_error_layer[layer], aer_agent_string[agent]);
> @@ -720,7 +716,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
> aer_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n",
> dev->vendor, dev->device, info->status, info->mask);
>
> - __aer_print_error(dev, info);
> + __aer_print_error(dev, info, level);
>
> if (info->tlp_header_valid)
> pcie_print_tlp_log(dev, &info->tlp, dev_fmt(" "));
> @@ -765,15 +761,18 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
> {
> int layer, agent, tlp_header_valid = 0;
> u32 status, mask;
> + const char *level;
> struct aer_err_info info;
>
> if (aer_severity == AER_CORRECTABLE) {
> status = aer->cor_status;
> mask = aer->cor_mask;
> + level = KERN_WARNING;
> } else {
> status = aer->uncor_status;
> mask = aer->uncor_mask;
> tlp_header_valid = status & AER_LOG_TLP_MASKS;
> + level = KERN_ERR;
> }
>
> layer = AER_GET_LAYER_ERROR(aer_severity, status);
> @@ -786,7 +785,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
> info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>
> pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> - __aer_print_error(dev, &info);
> + __aer_print_error(dev, &info, level);
> pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> aer_error_layer[layer], aer_agent_string[agent]);
>
> @@ -1257,14 +1256,15 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
> return 1;
> }
>
> -static inline void aer_process_err_devices(struct aer_err_info *e_info)
> +static inline void aer_process_err_devices(struct aer_err_info *e_info,
> + const char *level)
> {
> int i;
>
> /* Report all before handle them, not to lost records by reset etc. */
> for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
> if (aer_get_device_error_info(e_info->dev[i], e_info))
> - aer_print_error(e_info->dev[i], e_info);
> + aer_print_error(e_info->dev[i], e_info, level);
> }
> for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
> if (aer_get_device_error_info(e_info->dev[i], e_info))
> @@ -1282,6 +1282,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
> {
> struct pci_dev *pdev = rpc->rpd;
> struct aer_err_info e_info;
> + const char *level;
>
> pci_rootport_aer_stats_incr(pdev, e_src);
>
> @@ -1292,6 +1293,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
> if (e_src->status & PCI_ERR_ROOT_COR_RCV) {
> e_info.id = ERR_COR_ID(e_src->id);
> e_info.severity = AER_CORRECTABLE;
> + level = KERN_WARNING;
>
> if (e_src->status & PCI_ERR_ROOT_MULTI_COR_RCV)
> e_info.multi_error_valid = 1;
> @@ -1300,11 +1302,12 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
> aer_print_port_info(pdev, &e_info);
>
> if (find_source_device(pdev, &e_info))
> - aer_process_err_devices(&e_info);
> + aer_process_err_devices(&e_info, level);
Since this path is only taken for correctable error, you can directly use
aer_process_err_devices(&e_info, KERN_WARNING)
> }
>
> if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
> e_info.id = ERR_UNCOR_ID(e_src->id);
> + level = KERN_ERR;
>
> if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
> e_info.severity = AER_FATAL;
> @@ -1319,7 +1322,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
> aer_print_port_info(pdev, &e_info);
>
> if (find_source_device(pdev, &e_info))
> - aer_process_err_devices(&e_info);
> + aer_process_err_devices(&e_info, level);
Same as above.
aer_process_err_devices(&e_info, KERN_ERR)
> }
> }
>
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index df42f15c9829..9e4c9ac737a7 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -289,7 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
> else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
> dpc_get_aer_uncorrect_severity(pdev, &info) &&
> aer_get_device_error_info(pdev, &info)) {
> - aer_print_error(pdev, &info);
> + aer_print_error(pdev, &info, KERN_ERR);
> pci_aer_clear_nonfatal_status(pdev);
> pci_aer_clear_fatal_status(pdev);
> }
--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH v3 2/8] PCI/AER: Make all pci_print_aer() log levels depend on error type
2025-03-19 8:40 [PATCH v3 0/8] Rate limit AER logs Jon Pan-Doh
2025-03-19 8:40 ` [PATCH v3 1/8] PCI/AER: Check log level once and propagate down Jon Pan-Doh
@ 2025-03-19 8:40 ` Jon Pan-Doh
2025-03-19 9:52 ` Ilpo Järvinen
2025-03-20 2:39 ` Sathyanarayanan Kuppuswamy
2025-03-19 8:40 ` [PATCH v3 3/8] PCI/AER: Move AER stat collection out of __aer_print_error() Jon Pan-Doh
` (6 subsequent siblings)
8 siblings, 2 replies; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-19 8:40 UTC (permalink / raw)
To: Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Sathyanarayanan Kuppuswamy,
Lukas Wunner, Jonathan Cameron, Terry Bowman, Jon Pan-Doh
From: Karolina Stolarek <karolina.stolarek@oracle.com>
Some existing logs in pci_print_aer() log with error severity
by default. Convert them to depend on error type (consistent
with rest of AER logging).
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
Signed-off-by: Jon Pan-Doh <pandoh@google.com>
---
drivers/pci/pcie/aer.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index cc9c80cd88f3..7eeaad917134 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -784,14 +784,14 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
info.mask = mask;
info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
- pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
+ aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
__aer_print_error(dev, &info, level);
- pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
- aer_error_layer[layer], aer_agent_string[agent]);
+ aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
+ aer_error_layer[layer], aer_agent_string[agent]);
if (aer_severity != AER_CORRECTABLE)
- pci_err(dev, "aer_uncor_severity: 0x%08x\n",
- aer->uncor_severity);
+ aer_printk(level, dev, "aer_uncor_severity: 0x%08x\n",
+ aer->uncor_severity);
if (tlp_header_valid)
pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" "));
--
2.49.0.rc1.451.g8f38331e32-goog
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH v3 2/8] PCI/AER: Make all pci_print_aer() log levels depend on error type
2025-03-19 8:40 ` [PATCH v3 2/8] PCI/AER: Make all pci_print_aer() log levels depend on error type Jon Pan-Doh
@ 2025-03-19 9:52 ` Ilpo Järvinen
2025-03-20 2:39 ` Sathyanarayanan Kuppuswamy
1 sibling, 0 replies; 28+ messages in thread
From: Ilpo Järvinen @ 2025-03-19 9:52 UTC (permalink / raw)
To: Jon Pan-Doh
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Sathyanarayanan Kuppuswamy, Lukas Wunner, Jonathan Cameron,
Terry Bowman
[-- Attachment #1: Type: text/plain, Size: 1655 bytes --]
On Wed, 19 Mar 2025, Jon Pan-Doh wrote:
> From: Karolina Stolarek <karolina.stolarek@oracle.com>
>
> Some existing logs in pci_print_aer() log with error severity
> by default. Convert them to depend on error type (consistent
> with rest of AER logging).
>
> Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> ---
> drivers/pci/pcie/aer.c | 10 +++++-----
> 1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index cc9c80cd88f3..7eeaad917134 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -784,14 +784,14 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
> info.mask = mask;
> info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>
> - pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> + aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> __aer_print_error(dev, &info, level);
> - pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> - aer_error_layer[layer], aer_agent_string[agent]);
> + aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
> + aer_error_layer[layer], aer_agent_string[agent]);
>
> if (aer_severity != AER_CORRECTABLE)
> - pci_err(dev, "aer_uncor_severity: 0x%08x\n",
> - aer->uncor_severity);
> + aer_printk(level, dev, "aer_uncor_severity: 0x%08x\n",
> + aer->uncor_severity);
>
> if (tlp_header_valid)
> pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" "));
>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
--
i.
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH v3 2/8] PCI/AER: Make all pci_print_aer() log levels depend on error type
2025-03-19 8:40 ` [PATCH v3 2/8] PCI/AER: Make all pci_print_aer() log levels depend on error type Jon Pan-Doh
2025-03-19 9:52 ` Ilpo Järvinen
@ 2025-03-20 2:39 ` Sathyanarayanan Kuppuswamy
2025-03-20 8:27 ` Jon Pan-Doh
1 sibling, 1 reply; 28+ messages in thread
From: Sathyanarayanan Kuppuswamy @ 2025-03-20 2:39 UTC (permalink / raw)
To: Jon Pan-Doh, Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Lukas Wunner, Jonathan Cameron,
Terry Bowman
On 3/19/25 1:40 AM, Jon Pan-Doh wrote:
> From: Karolina Stolarek <karolina.stolarek@oracle.com>
>
> Some existing logs in pci_print_aer() log with error severity
> by default. Convert them to depend on error type (consistent
> with rest of AER logging).
>
> Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> ---
Since the original patch from Ilpo is not yet merged, may be it
worth considering add this change part of the same patch.
Reviewed-by: Kuppuswamy Sathyanarayanan
<sathyanarayanan.kuppuswamy@linux.intel.com>
> drivers/pci/pcie/aer.c | 10 +++++-----
> 1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index cc9c80cd88f3..7eeaad917134 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -784,14 +784,14 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
> info.mask = mask;
> info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>
> - pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> + aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> __aer_print_error(dev, &info, level);
> - pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> - aer_error_layer[layer], aer_agent_string[agent]);
> + aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
> + aer_error_layer[layer], aer_agent_string[agent]);
>
> if (aer_severity != AER_CORRECTABLE)
> - pci_err(dev, "aer_uncor_severity: 0x%08x\n",
> - aer->uncor_severity);
> + aer_printk(level, dev, "aer_uncor_severity: 0x%08x\n",
> + aer->uncor_severity);
>
> if (tlp_header_valid)
> pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" "));
--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH v3 2/8] PCI/AER: Make all pci_print_aer() log levels depend on error type
2025-03-20 2:39 ` Sathyanarayanan Kuppuswamy
@ 2025-03-20 8:27 ` Jon Pan-Doh
2025-03-20 14:23 ` Sathyanarayanan Kuppuswamy
0 siblings, 1 reply; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-20 8:27 UTC (permalink / raw)
To: Sathyanarayanan Kuppuswamy
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Lukas Wunner, Jonathan Cameron, Terry Bowman
On Wed, Mar 19, 2025 at 7:39 PM Sathyanarayanan Kuppuswamy
<sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
> Since the original patch from Ilpo is not yet merged, may be it
> worth considering add this change part of the same patch.
Which patch are you referring to?
Thanks,
Jon
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH v3 2/8] PCI/AER: Make all pci_print_aer() log levels depend on error type
2025-03-20 8:27 ` Jon Pan-Doh
@ 2025-03-20 14:23 ` Sathyanarayanan Kuppuswamy
2025-03-20 19:06 ` Jon Pan-Doh
0 siblings, 1 reply; 28+ messages in thread
From: Sathyanarayanan Kuppuswamy @ 2025-03-20 14:23 UTC (permalink / raw)
To: Jon Pan-Doh
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Lukas Wunner, Jonathan Cameron, Terry Bowman
On 3/20/25 1:27 AM, Jon Pan-Doh wrote:
> On Wed, Mar 19, 2025 at 7:39 PM Sathyanarayanan Kuppuswamy
> <sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
>> Since the original patch from Ilpo is not yet merged, may be it
>> worth considering add this change part of the same patch.
> Which patch are you referring to?
https://web.git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git/commit/?h=aer&id=fab874e12593b68f9a7fcb1a31a7dcf4829e88f7
>
> Thanks,
> Jon
>
--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH v3 2/8] PCI/AER: Make all pci_print_aer() log levels depend on error type
2025-03-20 14:23 ` Sathyanarayanan Kuppuswamy
@ 2025-03-20 19:06 ` Jon Pan-Doh
0 siblings, 0 replies; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-20 19:06 UTC (permalink / raw)
To: Sathyanarayanan Kuppuswamy
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Lukas Wunner, Jonathan Cameron, Terry Bowman
On Thu, Mar 20, 2025 at 7:23 AM Sathyanarayanan Kuppuswamy
<sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
> https://web.git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git/commit/?h=aer&id=fab874e12593b68f9a7fcb1a31a7dcf4829e88f7
Ah. I have no objections if this patch is squashed with Ilpo's.
Thanks,
Jon
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH v3 3/8] PCI/AER: Move AER stat collection out of __aer_print_error()
2025-03-19 8:40 [PATCH v3 0/8] Rate limit AER logs Jon Pan-Doh
2025-03-19 8:40 ` [PATCH v3 1/8] PCI/AER: Check log level once and propagate down Jon Pan-Doh
2025-03-19 8:40 ` [PATCH v3 2/8] PCI/AER: Make all pci_print_aer() log levels depend on error type Jon Pan-Doh
@ 2025-03-19 8:40 ` Jon Pan-Doh
2025-03-19 18:19 ` Bjorn Helgaas
2025-03-20 3:22 ` Sathyanarayanan Kuppuswamy
2025-03-19 8:40 ` [PATCH v3 4/8] PCI/AER: Rename struct aer_stats to aer_report Jon Pan-Doh
` (5 subsequent siblings)
8 siblings, 2 replies; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-19 8:40 UTC (permalink / raw)
To: Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Sathyanarayanan Kuppuswamy,
Lukas Wunner, Jonathan Cameron, Terry Bowman, Jon Pan-Doh
Decouple stat collection from internal AER print functions. AERs from ghes
or cxl drivers have stat collection in pci_print_aer() as that is where
aer_err_info is populated.
Signed-off-by: Jon Pan-Doh <pandoh@google.com>
---
drivers/pci/pci.h | 1 +
drivers/pci/pcie/aer.c | 10 ++++++----
drivers/pci/pcie/dpc.c | 1 +
3 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 75985b96ecc1..9d63d32f041c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -551,6 +551,7 @@ struct aer_err_info {
};
int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
+void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 7eeaad917134..8e4d4f9326e1 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -617,8 +617,7 @@ const struct attribute_group aer_stats_attr_group = {
.is_visible = aer_stats_attrs_are_visible,
};
-static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
- struct aer_err_info *info)
+void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
{
unsigned long status = info->status & ~info->mask;
int i, max = -1;
@@ -691,7 +690,6 @@ static void __aer_print_error(struct pci_dev *dev,
aer_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
info->first_error == i ? " (First)" : "");
}
- pci_dev_aer_stats_incr(dev, info);
}
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
@@ -784,6 +782,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
info.mask = mask;
info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
+ pci_dev_aer_stats_incr(dev, &info);
+
aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
__aer_print_error(dev, &info, level);
aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
@@ -1263,8 +1263,10 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info,
/* Report all before handle them, not to lost records by reset etc. */
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
- if (aer_get_device_error_info(e_info->dev[i], e_info))
+ if (aer_get_device_error_info(e_info->dev[i], e_info)) {
+ pci_dev_aer_stats_incr(e_info->dev[i], e_info);
aer_print_error(e_info->dev[i], e_info, level);
+ }
}
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
if (aer_get_device_error_info(e_info->dev[i], e_info))
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 9e4c9ac737a7..81cd6e8ff3a4 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -289,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
dpc_get_aer_uncorrect_severity(pdev, &info) &&
aer_get_device_error_info(pdev, &info)) {
+ pci_dev_aer_stats_incr(pdev, &info);
aer_print_error(pdev, &info, KERN_ERR);
pci_aer_clear_nonfatal_status(pdev);
pci_aer_clear_fatal_status(pdev);
--
2.49.0.rc1.451.g8f38331e32-goog
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH v3 3/8] PCI/AER: Move AER stat collection out of __aer_print_error()
2025-03-19 8:40 ` [PATCH v3 3/8] PCI/AER: Move AER stat collection out of __aer_print_error() Jon Pan-Doh
@ 2025-03-19 18:19 ` Bjorn Helgaas
2025-03-20 8:27 ` Jon Pan-Doh
2025-03-20 3:22 ` Sathyanarayanan Kuppuswamy
1 sibling, 1 reply; 28+ messages in thread
From: Bjorn Helgaas @ 2025-03-19 18:19 UTC (permalink / raw)
To: Jon Pan-Doh
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Sathyanarayanan Kuppuswamy, Lukas Wunner,
Jonathan Cameron, Terry Bowman
On Wed, Mar 19, 2025 at 01:40:44AM -0700, Jon Pan-Doh wrote:
> Decouple stat collection from internal AER print functions. AERs from ghes
> or cxl drivers have stat collection in pci_print_aer() as that is where
> aer_err_info is populated.
This moves pci_dev_aer_stats_incr() from __aer_print_error() to
- pci_print_aer(), used by CXL via cxl_handle_rdport_errors() and
GHES via aer_recover_queue() and aer_recover_work_func()
- aer_process_err_devices(), used by native AER handling via
aer_irq(), aer_isr(), aer_isr_one_error(), and
- dpc_process_error(), used by native DPC handling via dpc_handler()
and by ACPI EDR Notify events via edr_handle_event()
And the reason for this is ...?
Maybe just to separate stats from printing, which does seem reasonable
to me, although pci_print_aer() is still primarily a printing
function, albeit also an external interface.
In any event, I would like to include the motivation.
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> ---
> drivers/pci/pci.h | 1 +
> drivers/pci/pcie/aer.c | 10 ++++++----
> drivers/pci/pcie/dpc.c | 1 +
> 3 files changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 75985b96ecc1..9d63d32f041c 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -551,6 +551,7 @@ struct aer_err_info {
> };
>
> int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
> void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
>
> int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 7eeaad917134..8e4d4f9326e1 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -617,8 +617,7 @@ const struct attribute_group aer_stats_attr_group = {
> .is_visible = aer_stats_attrs_are_visible,
> };
>
> -static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
> - struct aer_err_info *info)
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
> {
> unsigned long status = info->status & ~info->mask;
> int i, max = -1;
> @@ -691,7 +690,6 @@ static void __aer_print_error(struct pci_dev *dev,
> aer_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
> info->first_error == i ? " (First)" : "");
> }
> - pci_dev_aer_stats_incr(dev, info);
> }
>
> void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
> @@ -784,6 +782,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
> info.mask = mask;
> info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>
> + pci_dev_aer_stats_incr(dev, &info);
> +
> aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> __aer_print_error(dev, &info, level);
> aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
> @@ -1263,8 +1263,10 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info,
>
> /* Report all before handle them, not to lost records by reset etc. */
> for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
> - if (aer_get_device_error_info(e_info->dev[i], e_info))
> + if (aer_get_device_error_info(e_info->dev[i], e_info)) {
> + pci_dev_aer_stats_incr(e_info->dev[i], e_info);
> aer_print_error(e_info->dev[i], e_info, level);
> + }
> }
> for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
> if (aer_get_device_error_info(e_info->dev[i], e_info))
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index 9e4c9ac737a7..81cd6e8ff3a4 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -289,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
> else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
> dpc_get_aer_uncorrect_severity(pdev, &info) &&
> aer_get_device_error_info(pdev, &info)) {
> + pci_dev_aer_stats_incr(pdev, &info);
> aer_print_error(pdev, &info, KERN_ERR);
> pci_aer_clear_nonfatal_status(pdev);
> pci_aer_clear_fatal_status(pdev);
> --
> 2.49.0.rc1.451.g8f38331e32-goog
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH v3 3/8] PCI/AER: Move AER stat collection out of __aer_print_error()
2025-03-19 18:19 ` Bjorn Helgaas
@ 2025-03-20 8:27 ` Jon Pan-Doh
0 siblings, 0 replies; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-20 8:27 UTC (permalink / raw)
To: Bjorn Helgaas
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Sathyanarayanan Kuppuswamy, Lukas Wunner,
Jonathan Cameron, Terry Bowman
On Wed, Mar 19, 2025 at 11:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> This moves pci_dev_aer_stats_incr() from __aer_print_error() to
>
> - pci_print_aer(), used by CXL via cxl_handle_rdport_errors() and
> GHES via aer_recover_queue() and aer_recover_work_func()
>
> - aer_process_err_devices(), used by native AER handling via
> aer_irq(), aer_isr(), aer_isr_one_error(), and
>
> - dpc_process_error(), used by native DPC handling via dpc_handler()
> and by ACPI EDR Notify events via edr_handle_event()
>
> And the reason for this is ...?
>
> Maybe just to separate stats from printing, which does seem reasonable
> to me, although pci_print_aer() is still primarily a printing
> function, albeit also an external interface.
Separating stats from internal print functions allows us to:
- easily add ratelimits to logs while having stats unaffected
- simplify control flow as stats collection is no longer buried
pci_print_aer() is the odd one out, but that should be temporary as
Karolina's log
consolidation patch[1] should allow pci_dev_aer_stats_incr() to be pulled out of
print functions and called after the newly created populate_aer_err_info().
> In any event, I would like to include the motivation.
Ack. Added in v4.
[1] https://lore.kernel.org/linux-pci/8bcb8c9a7b38ce3bdaca5a64fe76f08b0b337511.1742202797.git.karolina.stolarek@oracle.com/
Thanks,
Jon
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH v3 3/8] PCI/AER: Move AER stat collection out of __aer_print_error()
2025-03-19 8:40 ` [PATCH v3 3/8] PCI/AER: Move AER stat collection out of __aer_print_error() Jon Pan-Doh
2025-03-19 18:19 ` Bjorn Helgaas
@ 2025-03-20 3:22 ` Sathyanarayanan Kuppuswamy
2025-03-20 8:29 ` Jon Pan-Doh
1 sibling, 1 reply; 28+ messages in thread
From: Sathyanarayanan Kuppuswamy @ 2025-03-20 3:22 UTC (permalink / raw)
To: Jon Pan-Doh, Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Lukas Wunner, Jonathan Cameron,
Terry Bowman
On 3/19/25 1:40 AM, Jon Pan-Doh wrote:
> Decouple stat collection from internal AER print functions. AERs from ghes
> or cxl drivers have stat collection in pci_print_aer() as that is where
> aer_err_info is populated.
pci_print_aer() also seems to do more than printing the AER stat. Why only
care about stat collection in __aer_print_error(). If the motivation is
to just improve the code readability, I am not sure it is worth the
effort. Please correct me if my understanding is incorrect.
>
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> ---
> drivers/pci/pci.h | 1 +
> drivers/pci/pcie/aer.c | 10 ++++++----
> drivers/pci/pcie/dpc.c | 1 +
> 3 files changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 75985b96ecc1..9d63d32f041c 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -551,6 +551,7 @@ struct aer_err_info {
> };
>
> int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
> void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
>
> int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 7eeaad917134..8e4d4f9326e1 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -617,8 +617,7 @@ const struct attribute_group aer_stats_attr_group = {
> .is_visible = aer_stats_attrs_are_visible,
> };
>
> -static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
> - struct aer_err_info *info)
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
> {
> unsigned long status = info->status & ~info->mask;
> int i, max = -1;
> @@ -691,7 +690,6 @@ static void __aer_print_error(struct pci_dev *dev,
> aer_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
> info->first_error == i ? " (First)" : "");
> }
> - pci_dev_aer_stats_incr(dev, info);
> }
>
> void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
> @@ -784,6 +782,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
> info.mask = mask;
> info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>
> + pci_dev_aer_stats_incr(dev, &info);
> +
> aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> __aer_print_error(dev, &info, level);
> aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
> @@ -1263,8 +1263,10 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info,
>
> /* Report all before handle them, not to lost records by reset etc. */
> for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
> - if (aer_get_device_error_info(e_info->dev[i], e_info))
> + if (aer_get_device_error_info(e_info->dev[i], e_info)) {
> + pci_dev_aer_stats_incr(e_info->dev[i], e_info);
> aer_print_error(e_info->dev[i], e_info, level);
> + }
> }
> for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
> if (aer_get_device_error_info(e_info->dev[i], e_info))
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index 9e4c9ac737a7..81cd6e8ff3a4 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -289,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
> else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
> dpc_get_aer_uncorrect_severity(pdev, &info) &&
> aer_get_device_error_info(pdev, &info)) {
> + pci_dev_aer_stats_incr(pdev, &info);
> aer_print_error(pdev, &info, KERN_ERR);
> pci_aer_clear_nonfatal_status(pdev);
> pci_aer_clear_fatal_status(pdev);
--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH v3 3/8] PCI/AER: Move AER stat collection out of __aer_print_error()
2025-03-20 3:22 ` Sathyanarayanan Kuppuswamy
@ 2025-03-20 8:29 ` Jon Pan-Doh
0 siblings, 0 replies; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-20 8:29 UTC (permalink / raw)
To: Sathyanarayanan Kuppuswamy
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Lukas Wunner, Jonathan Cameron, Terry Bowman
On Wed, Mar 19, 2025 at 8:23 PM Sathyanarayanan Kuppuswamy
<sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
> pci_print_aer() also seems to do more than printing the AER stat. Why only
> care about stat collection in __aer_print_error(). If the motivation is
> to just improve the code readability, I am not sure it is worth the
> effort. Please correct me if my understanding is incorrect.
Bjorn had similar concerns. Hopefully my response[1] answers your questions.
[1] https://lore.kernel.org/linux-pci/CAMC_AXW85x_LRT5UTD0C_VvK8WTG6kbfvp5k_7RjnLzGM3Bg-g@mail.gmail.com/
Thanks,
Jon
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH v3 4/8] PCI/AER: Rename struct aer_stats to aer_report
2025-03-19 8:40 [PATCH v3 0/8] Rate limit AER logs Jon Pan-Doh
` (2 preceding siblings ...)
2025-03-19 8:40 ` [PATCH v3 3/8] PCI/AER: Move AER stat collection out of __aer_print_error() Jon Pan-Doh
@ 2025-03-19 8:40 ` Jon Pan-Doh
2025-03-20 3:29 ` Sathyanarayanan Kuppuswamy
2025-03-19 8:40 ` [PATCH v3 5/8] PCI/AER: Introduce ratelimit for error logs Jon Pan-Doh
` (4 subsequent siblings)
8 siblings, 1 reply; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-19 8:40 UTC (permalink / raw)
To: Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Sathyanarayanan Kuppuswamy,
Lukas Wunner, Jonathan Cameron, Terry Bowman, Jon Pan-Doh
Update name to reflect the broader definition of structs/variables that
are stored (e.g. ratelimits).
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
Signed-off-by: Jon Pan-Doh <pandoh@google.com>
Reviewed-by: Karolina Stolarek <karolina.stolarek@oracle.com>
---
drivers/pci/pcie/aer.c | 50 +++++++++++++++++++++---------------------
include/linux/pci.h | 2 +-
2 files changed, 26 insertions(+), 26 deletions(-)
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 8e4d4f9326e1..3a12980f7dd7 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -54,11 +54,11 @@ struct aer_rpc {
DECLARE_KFIFO(aer_fifo, struct aer_err_source, AER_ERROR_SOURCES_MAX);
};
-/* AER stats for the device */
-struct aer_stats {
+/* AER report for the device */
+struct aer_report {
/*
- * Fields for all AER capable devices. They indicate the errors
+ * Stats for all AER capable devices. They indicate the errors
* "as seen by this device". Note that this may mean that if an
* end point is causing problems, the AER counters may increment
* at its link partner (e.g. root port) because the errors will be
@@ -80,7 +80,7 @@ struct aer_stats {
u64 dev_total_nonfatal_errs;
/*
- * Fields for Root ports & root complex event collectors only, these
+ * Stats for Root ports & root complex event collectors only, these
* indicate the total number of ERR_COR, ERR_FATAL, and ERR_NONFATAL
* messages received by the root port / event collector, INCLUDING the
* ones that are generated internally (by the rootport itself)
@@ -377,7 +377,7 @@ void pci_aer_init(struct pci_dev *dev)
if (!dev->aer_cap)
return;
- dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
+ dev->aer_report = kzalloc(sizeof(*dev->aer_report), GFP_KERNEL);
/*
* We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
@@ -398,8 +398,8 @@ void pci_aer_init(struct pci_dev *dev)
void pci_aer_exit(struct pci_dev *dev)
{
- kfree(dev->aer_stats);
- dev->aer_stats = NULL;
+ kfree(dev->aer_report);
+ dev->aer_report = NULL;
}
#define AER_AGENT_RECEIVER 0
@@ -537,10 +537,10 @@ static const char *aer_agent_string[] = {
{ \
unsigned int i; \
struct pci_dev *pdev = to_pci_dev(dev); \
- u64 *stats = pdev->aer_stats->stats_array; \
+ u64 *stats = pdev->aer_report->stats_array; \
size_t len = 0; \
\
- for (i = 0; i < ARRAY_SIZE(pdev->aer_stats->stats_array); i++) {\
+ for (i = 0; i < ARRAY_SIZE(pdev->aer_report->stats_array); i++) {\
if (strings_array[i]) \
len += sysfs_emit_at(buf, len, "%s %llu\n", \
strings_array[i], \
@@ -551,7 +551,7 @@ static const char *aer_agent_string[] = {
i, stats[i]); \
} \
len += sysfs_emit_at(buf, len, "TOTAL_%s %llu\n", total_string, \
- pdev->aer_stats->total_field); \
+ pdev->aer_report->total_field); \
return len; \
} \
static DEVICE_ATTR_RO(name)
@@ -572,7 +572,7 @@ aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs,
char *buf) \
{ \
struct pci_dev *pdev = to_pci_dev(dev); \
- return sysfs_emit(buf, "%llu\n", pdev->aer_stats->field); \
+ return sysfs_emit(buf, "%llu\n", pdev->aer_report->field); \
} \
static DEVICE_ATTR_RO(name)
@@ -599,7 +599,7 @@ static umode_t aer_stats_attrs_are_visible(struct kobject *kobj,
struct device *dev = kobj_to_dev(kobj);
struct pci_dev *pdev = to_pci_dev(dev);
- if (!pdev->aer_stats)
+ if (!pdev->aer_report)
return 0;
if ((a == &dev_attr_aer_rootport_total_err_cor.attr ||
@@ -622,25 +622,25 @@ void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
unsigned long status = info->status & ~info->mask;
int i, max = -1;
u64 *counter = NULL;
- struct aer_stats *aer_stats = pdev->aer_stats;
+ struct aer_report *aer_report = pdev->aer_report;
- if (!aer_stats)
+ if (!aer_report)
return;
switch (info->severity) {
case AER_CORRECTABLE:
- aer_stats->dev_total_cor_errs++;
- counter = &aer_stats->dev_cor_errs[0];
+ aer_report->dev_total_cor_errs++;
+ counter = &aer_report->dev_cor_errs[0];
max = AER_MAX_TYPEOF_COR_ERRS;
break;
case AER_NONFATAL:
- aer_stats->dev_total_nonfatal_errs++;
- counter = &aer_stats->dev_nonfatal_errs[0];
+ aer_report->dev_total_nonfatal_errs++;
+ counter = &aer_report->dev_nonfatal_errs[0];
max = AER_MAX_TYPEOF_UNCOR_ERRS;
break;
case AER_FATAL:
- aer_stats->dev_total_fatal_errs++;
- counter = &aer_stats->dev_fatal_errs[0];
+ aer_report->dev_total_fatal_errs++;
+ counter = &aer_report->dev_fatal_errs[0];
max = AER_MAX_TYPEOF_UNCOR_ERRS;
break;
}
@@ -652,19 +652,19 @@ void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
struct aer_err_source *e_src)
{
- struct aer_stats *aer_stats = pdev->aer_stats;
+ struct aer_report *aer_report = pdev->aer_report;
- if (!aer_stats)
+ if (!aer_report)
return;
if (e_src->status & PCI_ERR_ROOT_COR_RCV)
- aer_stats->rootport_total_cor_errs++;
+ aer_report->rootport_total_cor_errs++;
if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
- aer_stats->rootport_total_fatal_errs++;
+ aer_report->rootport_total_fatal_errs++;
else
- aer_stats->rootport_total_nonfatal_errs++;
+ aer_report->rootport_total_nonfatal_errs++;
}
}
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e4bf67bf8172..900edb6f8f62 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -346,7 +346,7 @@ struct pci_dev {
u8 hdr_type; /* PCI header type (`multi' flag masked out) */
#ifdef CONFIG_PCIEAER
u16 aer_cap; /* AER capability offset */
- struct aer_stats *aer_stats; /* AER stats for this device */
+ struct aer_report *aer_report; /* AER report for this device */
#endif
#ifdef CONFIG_PCIEPORTBUS
struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */
--
2.49.0.rc1.451.g8f38331e32-goog
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH v3 4/8] PCI/AER: Rename struct aer_stats to aer_report
2025-03-19 8:40 ` [PATCH v3 4/8] PCI/AER: Rename struct aer_stats to aer_report Jon Pan-Doh
@ 2025-03-20 3:29 ` Sathyanarayanan Kuppuswamy
2025-03-20 8:28 ` Jon Pan-Doh
0 siblings, 1 reply; 28+ messages in thread
From: Sathyanarayanan Kuppuswamy @ 2025-03-20 3:29 UTC (permalink / raw)
To: Jon Pan-Doh, Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Lukas Wunner, Jonathan Cameron,
Terry Bowman
On 3/19/25 1:40 AM, Jon Pan-Doh wrote:
> Update name to reflect the broader definition of structs/variables that
> are stored (e.g. ratelimits).
>
> Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> Reviewed-by: Karolina Stolarek <karolina.stolarek@oracle.com>
> ---
> drivers/pci/pcie/aer.c | 50 +++++++++++++++++++++---------------------
> include/linux/pci.h | 2 +-
> 2 files changed, 26 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 8e4d4f9326e1..3a12980f7dd7 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -54,11 +54,11 @@ struct aer_rpc {
> DECLARE_KFIFO(aer_fifo, struct aer_err_source, AER_ERROR_SOURCES_MAX);
> };
>
> -/* AER stats for the device */
> -struct aer_stats {
> +/* AER report for the device */
> +struct aer_report {
This struct only seems to have details related to AER error status. Why
rename
it to aer_report() ? Are you extending it in future patches? If yes,
include that
motivation in the commit log.
>
> /*
> - * Fields for all AER capable devices. They indicate the errors
> + * Stats for all AER capable devices. They indicate the errors
> * "as seen by this device". Note that this may mean that if an
> * end point is causing problems, the AER counters may increment
> * at its link partner (e.g. root port) because the errors will be
> @@ -80,7 +80,7 @@ struct aer_stats {
> u64 dev_total_nonfatal_errs;
>
> /*
> - * Fields for Root ports & root complex event collectors only, these
> + * Stats for Root ports & root complex event collectors only, these
> * indicate the total number of ERR_COR, ERR_FATAL, and ERR_NONFATAL
> * messages received by the root port / event collector, INCLUDING the
> * ones that are generated internally (by the rootport itself)
> @@ -377,7 +377,7 @@ void pci_aer_init(struct pci_dev *dev)
> if (!dev->aer_cap)
> return;
>
> - dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
> + dev->aer_report = kzalloc(sizeof(*dev->aer_report), GFP_KERNEL);
>
> /*
> * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
> @@ -398,8 +398,8 @@ void pci_aer_init(struct pci_dev *dev)
>
> void pci_aer_exit(struct pci_dev *dev)
> {
> - kfree(dev->aer_stats);
> - dev->aer_stats = NULL;
> + kfree(dev->aer_report);
> + dev->aer_report = NULL;
> }
>
> #define AER_AGENT_RECEIVER 0
> @@ -537,10 +537,10 @@ static const char *aer_agent_string[] = {
> { \
> unsigned int i; \
> struct pci_dev *pdev = to_pci_dev(dev); \
> - u64 *stats = pdev->aer_stats->stats_array; \
> + u64 *stats = pdev->aer_report->stats_array; \
> size_t len = 0; \
> \
> - for (i = 0; i < ARRAY_SIZE(pdev->aer_stats->stats_array); i++) {\
> + for (i = 0; i < ARRAY_SIZE(pdev->aer_report->stats_array); i++) {\
> if (strings_array[i]) \
> len += sysfs_emit_at(buf, len, "%s %llu\n", \
> strings_array[i], \
> @@ -551,7 +551,7 @@ static const char *aer_agent_string[] = {
> i, stats[i]); \
> } \
> len += sysfs_emit_at(buf, len, "TOTAL_%s %llu\n", total_string, \
> - pdev->aer_stats->total_field); \
> + pdev->aer_report->total_field); \
> return len; \
> } \
> static DEVICE_ATTR_RO(name)
> @@ -572,7 +572,7 @@ aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs,
> char *buf) \
> { \
> struct pci_dev *pdev = to_pci_dev(dev); \
> - return sysfs_emit(buf, "%llu\n", pdev->aer_stats->field); \
> + return sysfs_emit(buf, "%llu\n", pdev->aer_report->field); \
> } \
> static DEVICE_ATTR_RO(name)
>
> @@ -599,7 +599,7 @@ static umode_t aer_stats_attrs_are_visible(struct kobject *kobj,
> struct device *dev = kobj_to_dev(kobj);
> struct pci_dev *pdev = to_pci_dev(dev);
>
> - if (!pdev->aer_stats)
> + if (!pdev->aer_report)
> return 0;
>
> if ((a == &dev_attr_aer_rootport_total_err_cor.attr ||
> @@ -622,25 +622,25 @@ void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
> unsigned long status = info->status & ~info->mask;
> int i, max = -1;
> u64 *counter = NULL;
> - struct aer_stats *aer_stats = pdev->aer_stats;
> + struct aer_report *aer_report = pdev->aer_report;
>
> - if (!aer_stats)
> + if (!aer_report)
> return;
>
> switch (info->severity) {
> case AER_CORRECTABLE:
> - aer_stats->dev_total_cor_errs++;
> - counter = &aer_stats->dev_cor_errs[0];
> + aer_report->dev_total_cor_errs++;
> + counter = &aer_report->dev_cor_errs[0];
> max = AER_MAX_TYPEOF_COR_ERRS;
> break;
> case AER_NONFATAL:
> - aer_stats->dev_total_nonfatal_errs++;
> - counter = &aer_stats->dev_nonfatal_errs[0];
> + aer_report->dev_total_nonfatal_errs++;
> + counter = &aer_report->dev_nonfatal_errs[0];
> max = AER_MAX_TYPEOF_UNCOR_ERRS;
> break;
> case AER_FATAL:
> - aer_stats->dev_total_fatal_errs++;
> - counter = &aer_stats->dev_fatal_errs[0];
> + aer_report->dev_total_fatal_errs++;
> + counter = &aer_report->dev_fatal_errs[0];
> max = AER_MAX_TYPEOF_UNCOR_ERRS;
> break;
> }
> @@ -652,19 +652,19 @@ void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
> static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
> struct aer_err_source *e_src)
> {
> - struct aer_stats *aer_stats = pdev->aer_stats;
> + struct aer_report *aer_report = pdev->aer_report;
>
> - if (!aer_stats)
> + if (!aer_report)
> return;
>
> if (e_src->status & PCI_ERR_ROOT_COR_RCV)
> - aer_stats->rootport_total_cor_errs++;
> + aer_report->rootport_total_cor_errs++;
>
> if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
> if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
> - aer_stats->rootport_total_fatal_errs++;
> + aer_report->rootport_total_fatal_errs++;
> else
> - aer_stats->rootport_total_nonfatal_errs++;
> + aer_report->rootport_total_nonfatal_errs++;
> }
> }
>
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index e4bf67bf8172..900edb6f8f62 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -346,7 +346,7 @@ struct pci_dev {
> u8 hdr_type; /* PCI header type (`multi' flag masked out) */
> #ifdef CONFIG_PCIEAER
> u16 aer_cap; /* AER capability offset */
> - struct aer_stats *aer_stats; /* AER stats for this device */
> + struct aer_report *aer_report; /* AER report for this device */
> #endif
> #ifdef CONFIG_PCIEPORTBUS
> struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */
--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH v3 4/8] PCI/AER: Rename struct aer_stats to aer_report
2025-03-20 3:29 ` Sathyanarayanan Kuppuswamy
@ 2025-03-20 8:28 ` Jon Pan-Doh
0 siblings, 0 replies; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-20 8:28 UTC (permalink / raw)
To: Sathyanarayanan Kuppuswamy
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Lukas Wunner, Jonathan Cameron, Terry Bowman
On Wed, Mar 19, 2025 at 8:29 PM Sathyanarayanan Kuppuswamy
<sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
>
> On 3/19/25 1:40 AM, Jon Pan-Doh wrote:
> > -/* AER stats for the device */
> > -struct aer_stats {
> > +/* AER report for the device */
> > +struct aer_report {
>
> This struct only seems to have details related to AER error status. Why
> rename
> it to aer_report() ? Are you extending it in future patches? If yes,
> include that
> motivation in the commit log.
Karolina suggested aer_report[1] as a more suitable name than the
previously proposed aer_info. aer_stats is no longer suitable as the
struct contains more than just error counters (e.g. ratelimits).
I did not have a plan on extending it (other than maybe adding
ratelimits for IRQs in a follow-up series) at this time.
Do you have an alternative suggestion instead of aer_report?
[1] https://lore.kernel.org/linux-pci/12d24891-c32e-4a84-a6ee-4501106a6957@oracle.com/
Thanks,
Jon
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH v3 5/8] PCI/AER: Introduce ratelimit for error logs
2025-03-19 8:40 [PATCH v3 0/8] Rate limit AER logs Jon Pan-Doh
` (3 preceding siblings ...)
2025-03-19 8:40 ` [PATCH v3 4/8] PCI/AER: Rename struct aer_stats to aer_report Jon Pan-Doh
@ 2025-03-19 8:40 ` Jon Pan-Doh
2025-03-19 18:47 ` Bjorn Helgaas
2025-03-19 8:40 ` [PATCH v3 6/8] PCI/AER: Add ratelimits to PCI AER Documentation Jon Pan-Doh
` (3 subsequent siblings)
8 siblings, 1 reply; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-19 8:40 UTC (permalink / raw)
To: Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Sathyanarayanan Kuppuswamy,
Lukas Wunner, Jonathan Cameron, Terry Bowman, Jon Pan-Doh
Spammy devices can flood kernel logs with AER errors and slow/stall
execution. Add per-device ratelimits for AER correctable and uncorrectable
errors that use the kernel defaults (10 per 5s).
Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged
while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) show
true count of 11.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git
Signed-off-by: Jon Pan-Doh <pandoh@google.com>
Reviewed-by: Karolina Stolarek <karolina.stolarek@oracle.com>
---
drivers/pci/pcie/aer.c | 76 +++++++++++++++++++++++++++++++++---------
1 file changed, 61 insertions(+), 15 deletions(-)
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 3a12980f7dd7..0bd20c4993d4 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -28,6 +28,7 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/kfifo.h>
+#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <acpi/apei.h>
#include <acpi/ghes.h>
@@ -88,6 +89,10 @@ struct aer_report {
u64 rootport_total_cor_errs;
u64 rootport_total_fatal_errs;
u64 rootport_total_nonfatal_errs;
+
+ /* Ratelimits for errors */
+ struct ratelimit_state cor_log_ratelimit;
+ struct ratelimit_state uncor_log_ratelimit;
};
#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
@@ -379,6 +384,15 @@ void pci_aer_init(struct pci_dev *dev)
dev->aer_report = kzalloc(sizeof(*dev->aer_report), GFP_KERNEL);
+ /*
+ * Ratelimits are doubled as a given error produces 2 logs (root port
+ * and endpoint) that should be under same ratelimit.
+ */
+ ratelimit_state_init(&dev->aer_report->cor_log_ratelimit,
+ DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST * 2);
+ ratelimit_state_init(&dev->aer_report->uncor_log_ratelimit,
+ DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST * 2);
+
/*
* We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
* PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event
@@ -697,6 +711,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
{
int layer, agent;
int id = pci_dev_id(dev);
+ struct ratelimit_state *ratelimit;
if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
@@ -704,6 +719,17 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
goto out;
}
+ if (info->severity == AER_CORRECTABLE)
+ ratelimit = &dev->aer_report->cor_log_ratelimit;
+ else
+ ratelimit = &dev->aer_report->uncor_log_ratelimit;
+
+ trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
+ info->severity, info->tlp_header_valid, &info->tlp);
+
+ if (!__ratelimit(ratelimit))
+ return;
+
layer = AER_GET_LAYER_ERROR(info->severity, info->status);
agent = AER_GET_AGENT(info->severity, info->status);
@@ -722,21 +748,33 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
out:
if (info->id && info->error_dev_num > 1 && info->id == id)
pci_err(dev, " Error of this Agent is reported first\n");
-
- trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
- info->severity, info->tlp_header_valid, &info->tlp);
}
static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
{
u8 bus = info->id >> 8;
u8 devfn = info->id & 0xff;
+ struct ratelimit_state *ratelimit;
+ struct pci_dev *endpoint;
+ int i;
- pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d\n",
- info->multi_error_valid ? "Multiple " : "",
- aer_error_severity_string[info->severity],
- pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
- PCI_FUNC(devfn));
+ /* extract endpoint device ratelimit */
+ for (i = 0; i < info->error_dev_num; i++) {
+ endpoint = info->dev[i];
+ if (info->id == pci_dev_id(endpoint))
+ break;
+ }
+ if (info->severity == AER_CORRECTABLE)
+ ratelimit = &endpoint->aer_report->cor_log_ratelimit;
+ else
+ ratelimit = &endpoint->aer_report->uncor_log_ratelimit;
+
+ if (__ratelimit(ratelimit))
+ pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d\n",
+ info->multi_error_valid ? "Multiple " : "",
+ aer_error_severity_string[info->severity],
+ pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
+ PCI_FUNC(devfn));
}
#ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -761,12 +799,15 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
u32 status, mask;
const char *level;
struct aer_err_info info;
+ struct ratelimit_state *ratelimit;
if (aer_severity == AER_CORRECTABLE) {
+ ratelimit = &dev->aer_report->cor_log_ratelimit;
status = aer->cor_status;
mask = aer->cor_mask;
level = KERN_WARNING;
} else {
+ ratelimit = &dev->aer_report->uncor_log_ratelimit;
status = aer->uncor_status;
mask = aer->uncor_mask;
tlp_header_valid = status & AER_LOG_TLP_MASKS;
@@ -784,6 +825,12 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
pci_dev_aer_stats_incr(dev, &info);
+ trace_aer_event(dev_name(&dev->dev), (status & ~mask),
+ aer_severity, tlp_header_valid, &aer->header_log);
+
+ if (!__ratelimit(ratelimit))
+ return;
+
aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
__aer_print_error(dev, &info, level);
aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
@@ -795,9 +842,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
if (tlp_header_valid)
pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" "));
-
- trace_aer_event(dev_name(&dev->dev), (status & ~mask),
- aer_severity, tlp_header_valid, &aer->header_log);
}
EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
@@ -1301,10 +1345,11 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
e_info.multi_error_valid = 1;
else
e_info.multi_error_valid = 0;
- aer_print_port_info(pdev, &e_info);
- if (find_source_device(pdev, &e_info))
+ if (find_source_device(pdev, &e_info)) {
+ aer_print_port_info(pdev, &e_info);
aer_process_err_devices(&e_info, level);
+ }
}
if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
@@ -1321,10 +1366,11 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
else
e_info.multi_error_valid = 0;
- aer_print_port_info(pdev, &e_info);
- if (find_source_device(pdev, &e_info))
+ if (find_source_device(pdev, &e_info)) {
+ aer_print_port_info(pdev, &e_info);
aer_process_err_devices(&e_info, level);
+ }
}
}
--
2.49.0.rc1.451.g8f38331e32-goog
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH v3 5/8] PCI/AER: Introduce ratelimit for error logs
2025-03-19 8:40 ` [PATCH v3 5/8] PCI/AER: Introduce ratelimit for error logs Jon Pan-Doh
@ 2025-03-19 18:47 ` Bjorn Helgaas
2025-03-20 8:27 ` Jon Pan-Doh
0 siblings, 1 reply; 28+ messages in thread
From: Bjorn Helgaas @ 2025-03-19 18:47 UTC (permalink / raw)
To: Jon Pan-Doh
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Sathyanarayanan Kuppuswamy, Lukas Wunner,
Jonathan Cameron, Terry Bowman
On Wed, Mar 19, 2025 at 01:40:46AM -0700, Jon Pan-Doh wrote:
> Spammy devices can flood kernel logs with AER errors and slow/stall
> execution. Add per-device ratelimits for AER correctable and uncorrectable
> errors that use the kernel defaults (10 per 5s).
>
> Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged
> while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) show
> true count of 11.
I think this is really on the right track. A few minor comments
below.
> @@ -697,6 +711,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
> {
> int layer, agent;
> int id = pci_dev_id(dev);
> + struct ratelimit_state *ratelimit;
>
> if (!info->status) {
> pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> @@ -704,6 +719,17 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
> goto out;
> }
>
> + if (info->severity == AER_CORRECTABLE)
> + ratelimit = &dev->aer_report->cor_log_ratelimit;
> + else
> + ratelimit = &dev->aer_report->uncor_log_ratelimit;
> +
> + trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
> + info->severity, info->tlp_header_valid, &info->tlp);
> +
> + if (!__ratelimit(ratelimit))
> + return;
- I think the ratelimit lookup and __ratelimit() call should be
together since there's no need for trace_aer_event() to be in the
middle.
- The lookup and __ratelimit() calls are repeated and are probably
worth factoring out into something like this:
static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
- Previously we *always* called trace_aer_event(), but now we don't
in the !info->status case. Maybe an unintentional change? I
think we should call trace_aer_event() always, or change that in a
separate patch if we need to. This would always have been simpler
if trace_aer_event() had been the very first thing in the
function.
- The !info->status case message is not rate-limited. Seems like
maybe it should be?
> layer = AER_GET_LAYER_ERROR(info->severity, info->status);
> agent = AER_GET_AGENT(info->severity, info->status);
>
> @@ -722,21 +748,33 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
> out:
> if (info->id && info->error_dev_num > 1 && info->id == id)
> pci_err(dev, " Error of this Agent is reported first\n");
> -
> - trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
> - info->severity, info->tlp_header_valid, &info->tlp);
> }
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH v3 5/8] PCI/AER: Introduce ratelimit for error logs
2025-03-19 18:47 ` Bjorn Helgaas
@ 2025-03-20 8:27 ` Jon Pan-Doh
2025-03-20 18:23 ` Bjorn Helgaas
0 siblings, 1 reply; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-20 8:27 UTC (permalink / raw)
To: Bjorn Helgaas
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Sathyanarayanan Kuppuswamy, Lukas Wunner,
Jonathan Cameron, Terry Bowman
On Wed, Mar 19, 2025 at 11:47 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> - I think the ratelimit lookup and __ratelimit() call should be
> together since there's no need for trace_aer_event() to be in the
> middle.
>
> - The lookup and __ratelimit() calls are repeated and are probably
> worth factoring out into something like this:
>
> static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
Changed in v4.
> - Previously we *always* called trace_aer_event(), but now we don't
> in the !info->status case. Maybe an unintentional change? I
> think we should call trace_aer_event() always, or change that in a
> separate patch if we need to. This would always have been simpler
> if trace_aer_event() had been the very first thing in the
> function.
Good catch. That is an unintentional bug. trace_aer_event() should
always be called. Moved it to the first thing in aer_print_error() in
v4 (same patch as I wasn't sure what justification to put for a
separate commit message other than precursor for ratelimit).
>
> - The !info->status case message is not rate-limited. Seems like
> maybe it should be?
Changed in v4.
Thanks,
Jon
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH v3 5/8] PCI/AER: Introduce ratelimit for error logs
2025-03-20 8:27 ` Jon Pan-Doh
@ 2025-03-20 18:23 ` Bjorn Helgaas
0 siblings, 0 replies; 28+ messages in thread
From: Bjorn Helgaas @ 2025-03-20 18:23 UTC (permalink / raw)
To: Jon Pan-Doh
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Sathyanarayanan Kuppuswamy, Lukas Wunner,
Jonathan Cameron, Terry Bowman
On Thu, Mar 20, 2025 at 01:27:27AM -0700, Jon Pan-Doh wrote:
> On Wed, Mar 19, 2025 at 11:47 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > - Previously we *always* called trace_aer_event(), but now we don't
> > in the !info->status case. Maybe an unintentional change? I
> > think we should call trace_aer_event() always, or change that in a
> > separate patch if we need to. This would always have been simpler
> > if trace_aer_event() had been the very first thing in the
> > function.
>
> Good catch. That is an unintentional bug. trace_aer_event() should
> always be called. Moved it to the first thing in aer_print_error() in
> v4 (same patch as I wasn't sure what justification to put for a
> separate commit message other than precursor for ratelimit).
I wonder if trace_aer_event() and pci_dev_aer_stats_incr() should be
part of the same function since we always do both. But I guess the
trace needs a little more information. Minor thing we can worry about
later.
Bjorn
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH v3 6/8] PCI/AER: Add ratelimits to PCI AER Documentation
2025-03-19 8:40 [PATCH v3 0/8] Rate limit AER logs Jon Pan-Doh
` (4 preceding siblings ...)
2025-03-19 8:40 ` [PATCH v3 5/8] PCI/AER: Introduce ratelimit for error logs Jon Pan-Doh
@ 2025-03-19 8:40 ` Jon Pan-Doh
2025-03-19 8:40 ` [PATCH v3 7/8] PCI/AER: Add sysfs attributes for log ratelimits Jon Pan-Doh
` (2 subsequent siblings)
8 siblings, 0 replies; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-19 8:40 UTC (permalink / raw)
To: Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Sathyanarayanan Kuppuswamy,
Lukas Wunner, Jonathan Cameron, Terry Bowman, Jon Pan-Doh
Add ratelimits section for rationale and defaults.
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
Signed-off-by: Jon Pan-Doh <pandoh@google.com>
---
Documentation/PCI/pcieaer-howto.rst | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst
index f013f3b27c82..896d2a232a90 100644
--- a/Documentation/PCI/pcieaer-howto.rst
+++ b/Documentation/PCI/pcieaer-howto.rst
@@ -85,6 +85,17 @@ In the example, 'Requester ID' means the ID of the device that sent
the error message to the Root Port. Please refer to PCIe specs for other
fields.
+AER Ratelimits
+--------------
+
+Since error messages can be generated for each transaction, we may see
+large volumes of errors reported. To prevent spammy devices from flooding
+the console/stalling execution, messages are throttled by device and error
+type (correctable vs. uncorrectable).
+
+AER uses the default ratelimit of DEFAULT_RATELIMIT_BURST (10 events) over
+DEFAULT_RATELIMIT_INTERVAL (5 seconds).
+
AER Statistics / Counters
-------------------------
--
2.49.0.rc1.451.g8f38331e32-goog
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH v3 7/8] PCI/AER: Add sysfs attributes for log ratelimits
2025-03-19 8:40 [PATCH v3 0/8] Rate limit AER logs Jon Pan-Doh
` (5 preceding siblings ...)
2025-03-19 8:40 ` [PATCH v3 6/8] PCI/AER: Add ratelimits to PCI AER Documentation Jon Pan-Doh
@ 2025-03-19 8:40 ` Jon Pan-Doh
2025-03-19 9:51 ` Ilpo Järvinen
2025-03-19 8:40 ` [PATCH v3 8/8] PCI/AER: Update AER sysfs ABI filename Jon Pan-Doh
2025-03-19 22:29 ` [PATCH v3 0/8] Rate limit AER logs Sathyanarayanan Kuppuswamy
8 siblings, 1 reply; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-19 8:40 UTC (permalink / raw)
To: Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Sathyanarayanan Kuppuswamy,
Lukas Wunner, Jonathan Cameron, Terry Bowman, Jon Pan-Doh
Allow userspace to read/write log ratelimits per device (including
enable/disable). Create aer/ sysfs directory to store them and any
future aer configs.
Tested using aer-inject[1]. Configured correctable log ratelimit to 5.
Sent 6 AER errors. Observed 5 errors logged while AER stats
(cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) shows 6.
Disabled ratelimiting and sent 6 more AER errors. Observed all 6 errors
logged and accounted in AER stats (12 total errors).
[1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
Signed-off-by: Jon Pan-Doh <pandoh@google.com>
---
.../testing/sysfs-bus-pci-devices-aer_stats | 34 +++++++
Documentation/PCI/pcieaer-howto.rst | 3 +
drivers/pci/pci-sysfs.c | 1 +
drivers/pci/pci.h | 1 +
drivers/pci/pcie/aer.c | 93 +++++++++++++++++++
5 files changed, 132 insertions(+)
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
index d1f67bb81d5d..4561653fdbde 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
@@ -117,3 +117,37 @@ Date: July 2018
KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: Total number of ERR_NONFATAL messages reported to rootport.
+
+PCIe AER ratelimits
+-------------------
+
+These attributes show up under all the devices that are AER capable.
+They represent configurable ratelimits of logs per error type.
+
+See Documentation/PCI/pcieaer-howto.rst for more info on ratelimits.
+
+What: /sys/bus/pci/devices/<dev>/aer/ratelimit_log_enable
+Date: March 2025
+KernelVersion: 6.15.0
+Contact: linux-pci@vger.kernel.org, pandoh@google.com
+Description: Writing 1/0 enables/disables AER log ratelimiting. Reading
+ gets whether or not AER is currently enabled. Enabled by
+ default.
+
+What: /sys/bus/pci/devices/<dev>/aer/ratelimit_in_5secs_cor_log
+Date: March 2025
+KernelVersion: 6.15.0
+Contact: linux-pci@vger.kernel.org, pandoh@google.com
+Description: Ratelimit burst for correctable error logs. Writing a value
+ changes the number of errors (burst) allowed per interval
+ (5 second window) before ratelimiting. Reading gets the
+ current ratelimit burst.
+
+What: /sys/bus/pci/devices/<dev>/aer/ratelimit_in_5secs_uncor_log
+Date: March 2025
+KernelVersion: 6.15.0
+Contact: linux-pci@vger.kernel.org, pandoh@google.com
+Description: Ratelimit burst for uncorrectable error logs. Writing a
+ value changes the number of errors (burst) allowed per
+ interval (5 second window) before ratelimiting. Reading
+ gets the current ratelimit burst.
diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst
index 896d2a232a90..b45a2e18d1cf 100644
--- a/Documentation/PCI/pcieaer-howto.rst
+++ b/Documentation/PCI/pcieaer-howto.rst
@@ -96,6 +96,9 @@ type (correctable vs. uncorrectable).
AER uses the default ratelimit of DEFAULT_RATELIMIT_BURST (10 events) over
DEFAULT_RATELIMIT_INTERVAL (5 seconds).
+Ratelimits are exposed in the form of sysfs attributes and configurable.
+See Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats.
+
AER Statistics / Counters
-------------------------
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index b46ce1a2c554..16de3093294e 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1801,6 +1801,7 @@ const struct attribute_group *pci_dev_attr_groups[] = {
&pcie_dev_attr_group,
#ifdef CONFIG_PCIEAER
&aer_stats_attr_group,
+ &aer_attr_group,
#endif
#ifdef CONFIG_PCIEASPM
&aspm_ctrl_attr_group,
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9d63d32f041c..34633fe12201 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -889,6 +889,7 @@ void pci_no_aer(void);
void pci_aer_init(struct pci_dev *dev);
void pci_aer_exit(struct pci_dev *dev);
extern const struct attribute_group aer_stats_attr_group;
+extern const struct attribute_group aer_attr_group;
void pci_aer_clear_fatal_status(struct pci_dev *dev);
int pci_aer_clear_status(struct pci_dev *dev);
int pci_aer_raw_clear_status(struct pci_dev *dev);
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 0bd20c4993d4..13227a94c9f9 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -631,6 +631,99 @@ const struct attribute_group aer_stats_attr_group = {
.is_visible = aer_stats_attrs_are_visible,
};
+/*
+ * Ratelimit enable toggle uses interval value of
+ * 0: disabled
+ * DEFAULT_RATELIMIT_INTERVAL: enabled
+ */
+static ssize_t ratelimit_log_enable_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ bool enable = pdev->aer_report->cor_log_ratelimit.interval != 0;
+
+ return sysfs_emit(buf, "%d\n", enable);
+}
+
+static ssize_t ratelimit_log_enable_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ bool enable;
+ int interval;
+
+ if (kstrtobool(buf, &enable) < 0)
+ return -EINVAL;
+
+ if (enable)
+ interval = DEFAULT_RATELIMIT_INTERVAL;
+ else
+ interval = 0;
+
+ pdev->aer_report->cor_log_ratelimit.interval = interval;
+ pdev->aer_report->uncor_log_ratelimit.interval = interval;
+ return count;
+}
+static DEVICE_ATTR_RW(ratelimit_log_enable);
+
+/*
+ * Ratelimits are doubled as a given error produces 2 logs (root port
+ * and endpoint) that should be under same ratelimit.
+ */
+#define aer_ratelimit_burst_attr(name, ratelimit) \
+ static ssize_t \
+ name##_show(struct device *dev, struct device_attribute *attr, \
+ char *buf) \
+{ \
+ struct pci_dev *pdev = to_pci_dev(dev); \
+ return sysfs_emit(buf, "%d\n", \
+ pdev->aer_report->ratelimit.burst / 2); \
+} \
+ \
+ static ssize_t \
+ name##_store(struct device *dev, struct device_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ struct pci_dev *pdev = to_pci_dev(dev); \
+ int burst; \
+ \
+ if (kstrtoint(buf, 0, &burst) < 0) \
+ return -EINVAL; \
+ \
+ pdev->aer_report->ratelimit.burst = burst * 2; \
+ return count; \
+} \
+static DEVICE_ATTR_RW(name)
+
+aer_ratelimit_burst_attr(ratelimit_in_5secs_cor_log, cor_log_ratelimit);
+aer_ratelimit_burst_attr(ratelimit_in_5secs_uncor_log, uncor_log_ratelimit);
+
+static struct attribute *aer_attrs[] = {
+ &dev_attr_ratelimit_log_enable.attr,
+ &dev_attr_ratelimit_in_5secs_cor_log.attr,
+ &dev_attr_ratelimit_in_5secs_uncor_log.attr,
+ NULL
+};
+
+static umode_t aer_attrs_are_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ if (!pdev->aer_report)
+ return 0;
+ return a->mode;
+}
+
+const struct attribute_group aer_attr_group = {
+ .name = "aer",
+ .attrs = aer_attrs,
+ .is_visible = aer_attrs_are_visible,
+};
+
void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
{
unsigned long status = info->status & ~info->mask;
--
2.49.0.rc1.451.g8f38331e32-goog
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH v3 7/8] PCI/AER: Add sysfs attributes for log ratelimits
2025-03-19 8:40 ` [PATCH v3 7/8] PCI/AER: Add sysfs attributes for log ratelimits Jon Pan-Doh
@ 2025-03-19 9:51 ` Ilpo Järvinen
2025-03-20 8:27 ` Jon Pan-Doh
0 siblings, 1 reply; 28+ messages in thread
From: Ilpo Järvinen @ 2025-03-19 9:51 UTC (permalink / raw)
To: Jon Pan-Doh
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Sathyanarayanan Kuppuswamy, Lukas Wunner, Jonathan Cameron,
Terry Bowman
On Wed, 19 Mar 2025, Jon Pan-Doh wrote:
> Allow userspace to read/write log ratelimits per device (including
> enable/disable). Create aer/ sysfs directory to store them and any
> future aer configs.
>
> Tested using aer-inject[1]. Configured correctable log ratelimit to 5.
> Sent 6 AER errors. Observed 5 errors logged while AER stats
> (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) shows 6.
>
> Disabled ratelimiting and sent 6 more AER errors. Observed all 6 errors
> logged and accounted in AER stats (12 total errors).
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git
>
> Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> ---
> .../testing/sysfs-bus-pci-devices-aer_stats | 34 +++++++
> Documentation/PCI/pcieaer-howto.rst | 3 +
> drivers/pci/pci-sysfs.c | 1 +
> drivers/pci/pci.h | 1 +
> drivers/pci/pcie/aer.c | 93 +++++++++++++++++++
> 5 files changed, 132 insertions(+)
>
> diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
> index d1f67bb81d5d..4561653fdbde 100644
> --- a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
> +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
> @@ -117,3 +117,37 @@ Date: July 2018
> KernelVersion: 4.19.0
> Contact: linux-pci@vger.kernel.org, rajatja@google.com
> Description: Total number of ERR_NONFATAL messages reported to rootport.
> +
> +PCIe AER ratelimits
> +-------------------
> +
> +These attributes show up under all the devices that are AER capable.
> +They represent configurable ratelimits of logs per error type.
> +
> +See Documentation/PCI/pcieaer-howto.rst for more info on ratelimits.
> +
> +What: /sys/bus/pci/devices/<dev>/aer/ratelimit_log_enable
> +Date: March 2025
> +KernelVersion: 6.15.0
> +Contact: linux-pci@vger.kernel.org, pandoh@google.com
> +Description: Writing 1/0 enables/disables AER log ratelimiting. Reading
> + gets whether or not AER is currently enabled. Enabled by
> + default.
> +
> +What: /sys/bus/pci/devices/<dev>/aer/ratelimit_in_5secs_cor_log
> +Date: March 2025
> +KernelVersion: 6.15.0
> +Contact: linux-pci@vger.kernel.org, pandoh@google.com
> +Description: Ratelimit burst for correctable error logs. Writing a value
> + changes the number of errors (burst) allowed per interval
> + (5 second window) before ratelimiting. Reading gets the
> + current ratelimit burst.
> +
> +What: /sys/bus/pci/devices/<dev>/aer/ratelimit_in_5secs_uncor_log
> +Date: March 2025
> +KernelVersion: 6.15.0
> +Contact: linux-pci@vger.kernel.org, pandoh@google.com
> +Description: Ratelimit burst for uncorrectable error logs. Writing a
> + value changes the number of errors (burst) allowed per
> + interval (5 second window) before ratelimiting. Reading
> + gets the current ratelimit burst.
> diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst
> index 896d2a232a90..b45a2e18d1cf 100644
> --- a/Documentation/PCI/pcieaer-howto.rst
> +++ b/Documentation/PCI/pcieaer-howto.rst
> @@ -96,6 +96,9 @@ type (correctable vs. uncorrectable).
> AER uses the default ratelimit of DEFAULT_RATELIMIT_BURST (10 events) over
> DEFAULT_RATELIMIT_INTERVAL (5 seconds).
>
> +Ratelimits are exposed in the form of sysfs attributes and configurable.
> +See Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats.
Why does the next patch immediately modify this? A patch series should try
to avoid back and forth changes like that.
--
i.
> AER Statistics / Counters
> -------------------------
>
> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> index b46ce1a2c554..16de3093294e 100644
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -1801,6 +1801,7 @@ const struct attribute_group *pci_dev_attr_groups[] = {
> &pcie_dev_attr_group,
> #ifdef CONFIG_PCIEAER
> &aer_stats_attr_group,
> + &aer_attr_group,
> #endif
> #ifdef CONFIG_PCIEASPM
> &aspm_ctrl_attr_group,
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 9d63d32f041c..34633fe12201 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -889,6 +889,7 @@ void pci_no_aer(void);
> void pci_aer_init(struct pci_dev *dev);
> void pci_aer_exit(struct pci_dev *dev);
> extern const struct attribute_group aer_stats_attr_group;
> +extern const struct attribute_group aer_attr_group;
> void pci_aer_clear_fatal_status(struct pci_dev *dev);
> int pci_aer_clear_status(struct pci_dev *dev);
> int pci_aer_raw_clear_status(struct pci_dev *dev);
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 0bd20c4993d4..13227a94c9f9 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -631,6 +631,99 @@ const struct attribute_group aer_stats_attr_group = {
> .is_visible = aer_stats_attrs_are_visible,
> };
>
> +/*
> + * Ratelimit enable toggle uses interval value of
> + * 0: disabled
> + * DEFAULT_RATELIMIT_INTERVAL: enabled
> + */
> +static ssize_t ratelimit_log_enable_show(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct pci_dev *pdev = to_pci_dev(dev);
> + bool enable = pdev->aer_report->cor_log_ratelimit.interval != 0;
> +
> + return sysfs_emit(buf, "%d\n", enable);
> +}
> +
> +static ssize_t ratelimit_log_enable_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct pci_dev *pdev = to_pci_dev(dev);
> + bool enable;
> + int interval;
> +
> + if (kstrtobool(buf, &enable) < 0)
> + return -EINVAL;
> +
> + if (enable)
> + interval = DEFAULT_RATELIMIT_INTERVAL;
> + else
> + interval = 0;
> +
> + pdev->aer_report->cor_log_ratelimit.interval = interval;
> + pdev->aer_report->uncor_log_ratelimit.interval = interval;
> + return count;
> +}
> +static DEVICE_ATTR_RW(ratelimit_log_enable);
> +
> +/*
> + * Ratelimits are doubled as a given error produces 2 logs (root port
> + * and endpoint) that should be under same ratelimit.
> + */
> +#define aer_ratelimit_burst_attr(name, ratelimit) \
> + static ssize_t \
> + name##_show(struct device *dev, struct device_attribute *attr, \
> + char *buf) \
> +{ \
> + struct pci_dev *pdev = to_pci_dev(dev); \
> + return sysfs_emit(buf, "%d\n", \
> + pdev->aer_report->ratelimit.burst / 2); \
> +} \
> + \
> + static ssize_t \
> + name##_store(struct device *dev, struct device_attribute *attr, \
> + const char *buf, size_t count) \
> +{ \
> + struct pci_dev *pdev = to_pci_dev(dev); \
> + int burst; \
> + \
> + if (kstrtoint(buf, 0, &burst) < 0) \
> + return -EINVAL; \
> + \
> + pdev->aer_report->ratelimit.burst = burst * 2; \
> + return count; \
> +} \
> +static DEVICE_ATTR_RW(name)
> +
> +aer_ratelimit_burst_attr(ratelimit_in_5secs_cor_log, cor_log_ratelimit);
> +aer_ratelimit_burst_attr(ratelimit_in_5secs_uncor_log, uncor_log_ratelimit);
> +
> +static struct attribute *aer_attrs[] = {
> + &dev_attr_ratelimit_log_enable.attr,
> + &dev_attr_ratelimit_in_5secs_cor_log.attr,
> + &dev_attr_ratelimit_in_5secs_uncor_log.attr,
> + NULL
> +};
> +
> +static umode_t aer_attrs_are_visible(struct kobject *kobj,
> + struct attribute *a, int n)
> +{
> + struct device *dev = kobj_to_dev(kobj);
> + struct pci_dev *pdev = to_pci_dev(dev);
> +
> + if (!pdev->aer_report)
> + return 0;
> + return a->mode;
> +}
> +
> +const struct attribute_group aer_attr_group = {
> + .name = "aer",
> + .attrs = aer_attrs,
> + .is_visible = aer_attrs_are_visible,
> +};
> +
> void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
> {
> unsigned long status = info->status & ~info->mask;
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH v3 7/8] PCI/AER: Add sysfs attributes for log ratelimits
2025-03-19 9:51 ` Ilpo Järvinen
@ 2025-03-20 8:27 ` Jon Pan-Doh
0 siblings, 0 replies; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-20 8:27 UTC (permalink / raw)
To: Ilpo Järvinen
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Sathyanarayanan Kuppuswamy, Lukas Wunner, Jonathan Cameron,
Terry Bowman
On Wed, Mar 19, 2025 at 2:51 AM Ilpo Järvinen
<ilpo.jarvinen@linux.intel.com> wrote:
> Why does the next patch immediately modify this? A patch series should try
> to avoid back and forth changes like that.
Ack. The intention was to make the patch as small as possible.
Squashed patch 8 into this patch in v4.
Thanks,
Jon
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH v3 8/8] PCI/AER: Update AER sysfs ABI filename
2025-03-19 8:40 [PATCH v3 0/8] Rate limit AER logs Jon Pan-Doh
` (6 preceding siblings ...)
2025-03-19 8:40 ` [PATCH v3 7/8] PCI/AER: Add sysfs attributes for log ratelimits Jon Pan-Doh
@ 2025-03-19 8:40 ` Jon Pan-Doh
2025-03-19 22:29 ` [PATCH v3 0/8] Rate limit AER logs Sathyanarayanan Kuppuswamy
8 siblings, 0 replies; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-19 8:40 UTC (permalink / raw)
To: Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Sathyanarayanan Kuppuswamy,
Lukas Wunner, Jonathan Cameron, Terry Bowman, Jon Pan-Doh
Change Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats to
Documentation/ABI/testing/sysfs-bus-pci-devices-aer to reflect the broader
scope of AER sysfs attributes (e.g. stats and ratelimits).
Signed-off-by: Jon Pan-Doh <pandoh@google.com>
---
...fs-bus-pci-devices-aer_stats => sysfs-bus-pci-devices-aer} | 0
Documentation/PCI/pcieaer-howto.rst | 4 ++--
2 files changed, 2 insertions(+), 2 deletions(-)
rename Documentation/ABI/testing/{sysfs-bus-pci-devices-aer_stats => sysfs-bus-pci-devices-aer} (100%)
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer
similarity index 100%
rename from Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
rename to Documentation/ABI/testing/sysfs-bus-pci-devices-aer
diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst
index b45a2e18d1cf..043cdb3194be 100644
--- a/Documentation/PCI/pcieaer-howto.rst
+++ b/Documentation/PCI/pcieaer-howto.rst
@@ -97,14 +97,14 @@ AER uses the default ratelimit of DEFAULT_RATELIMIT_BURST (10 events) over
DEFAULT_RATELIMIT_INTERVAL (5 seconds).
Ratelimits are exposed in the form of sysfs attributes and configurable.
-See Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats.
+See Documentation/ABI/testing/sysfs-bus-pci-devices-aer.
AER Statistics / Counters
-------------------------
When PCIe AER errors are captured, the counters / statistics are also exposed
in the form of sysfs attributes which are documented at
-Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
+Documentation/ABI/testing/sysfs-bus-pci-devices-aer.
Developer Guide
===============
--
2.49.0.rc1.451.g8f38331e32-goog
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH v3 0/8] Rate limit AER logs
2025-03-19 8:40 [PATCH v3 0/8] Rate limit AER logs Jon Pan-Doh
` (7 preceding siblings ...)
2025-03-19 8:40 ` [PATCH v3 8/8] PCI/AER: Update AER sysfs ABI filename Jon Pan-Doh
@ 2025-03-19 22:29 ` Sathyanarayanan Kuppuswamy
2025-03-19 22:52 ` Jon Pan-Doh
8 siblings, 1 reply; 28+ messages in thread
From: Sathyanarayanan Kuppuswamy @ 2025-03-19 22:29 UTC (permalink / raw)
To: Jon Pan-Doh, Bjorn Helgaas, Karolina Stolarek
Cc: linux-pci, Martin Petersen, Ben Fuller, Drew Walton, Anil Agrawal,
Tony Luck, Ilpo Järvinen, Lukas Wunner, Jonathan Cameron,
Terry Bowman
Hi Jon,
On 3/19/25 1:40 AM, Jon Pan-Doh wrote:
> Proposal
> ========
>
> When using native AER, spammy devices can flood kernel logs with AER errors
> and slow/stall execution. Add per-device per-error-severity ratelimits
> for more robust error logging. Allow userspace to configure ratelimits
> via sysfs knobs.
>
> Motivation
> ==========
>
> Several OCP members have issues with inconsistent PCIe error handling,
> exacerbated at datacenter scale (myriad of devices).
> OCP HW/Fault Management subproject set out to solve this by
> standardizing industry:
>
> - PCIe error handling best practices
> - Fault Management/RAS (incl. PCIe errors)
>
> Exposing PCIe errors/debug info in-band for a userspace daemon (e.g.
> rasdaemon) to collect/pass on to repairability services is part of the
> roadmap.
>
> Background
> ==========
>
> AER error spam has been observed many times, both publicly (e.g. [1], [2],
> [3]) and privately. While it usually occurs with correctable errors, it can
> happen with uncorrectable errors (e.g. during new HW bringup).
>
> There have been previous attempts to add ratelimits to AER logs ([4],
> [5]). The most recent attempt[5] has many similarities with the proposed
> approach.
>
> Patch organization
> ==================
> 1-4 AER logging cleanup
> 5-8 Ratelimits and sysfs knobs
>
> Outstanding work
> ================
> Cleanup:
> - Consolidate aer_print_error() and pci_print_error() path
>
> Roadmap:
> - IRQ ratelimiting
What is the baseline version for this patch set? When I tried to apply it on
v6.14-rc7 or linux-next, it does not apply cleanly.
> v3:
> - Ratelimit aer_print_port_info() (drop Patch 1)
> - Add ratelimit enable toggle
> - Move trace outside of ratelimit
> - Split log level (Patch 2) into two
> - More descriptive documentation/sysfs naming
> v2:
> - Rebased on top of pci/aer (6.14.rc-1)
> - Split series into log and IRQ ratelimits (defer patch 5)
> - Dropped patch 8 (Move AER sysfs)
> - Added log level cleanup patch[6] from Karolina's series
> - Fixed bug where dpc errors didn't increment counters
> - "X callbacks suppressed" message on ratelimit release -> immediately
> - Separate documentation into own patch
>
> [1] https://bugzilla.kernel.org/show_bug.cgi?id=215027
> [2] https://bugzilla.kernel.org/show_bug.cgi?id=201517
> [3] https://bugzilla.kernel.org/show_bug.cgi?id=196183
> [4] https://lore.kernel.org/linux-pci/20230606035442.2886343-2-grundler@chromium.org/
> [5] https://lore.kernel.org/linux-pci/cover.1736341506.git.karolina.stolarek@oracle.com/
> [6] https://lore.kernel.org/linux-pci/edd77011aafad4c0654358a26b4e538d0c5a321d.1736341506.git.karolina.stolarek@oracle.com/
>
> Jon Pan-Doh (6):
> PCI/AER: Move AER stat collection out of __aer_print_error()
> PCI/AER: Rename struct aer_stats to aer_report
> PCI/AER: Introduce ratelimit for error logs
> PCI/AER: Add ratelimits to PCI AER Documentation
> PCI/AER: Add sysfs attributes for log ratelimits
> PCI/AER: Update AER sysfs ABI filename
>
> Karolina Stolarek (2):
> PCI/AER: Check log level once and propagate down
> PCI/AER: Make all pci_print_aer() log levels depend on error type
>
> ...es-aer_stats => sysfs-bus-pci-devices-aer} | 34 +++
> Documentation/PCI/pcieaer-howto.rst | 16 +-
> drivers/pci/pci-sysfs.c | 1 +
> drivers/pci/pci.h | 4 +-
> drivers/pci/pcie/aer.c | 276 +++++++++++++-----
> drivers/pci/pcie/dpc.c | 3 +-
> include/linux/pci.h | 2 +-
> 7 files changed, 266 insertions(+), 70 deletions(-)
> rename Documentation/ABI/testing/{sysfs-bus-pci-devices-aer_stats => sysfs-bus-pci-devices-aer} (77%)
>
--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH v3 0/8] Rate limit AER logs
2025-03-19 22:29 ` [PATCH v3 0/8] Rate limit AER logs Sathyanarayanan Kuppuswamy
@ 2025-03-19 22:52 ` Jon Pan-Doh
0 siblings, 0 replies; 28+ messages in thread
From: Jon Pan-Doh @ 2025-03-19 22:52 UTC (permalink / raw)
To: Sathyanarayanan Kuppuswamy
Cc: Bjorn Helgaas, Karolina Stolarek, linux-pci, Martin Petersen,
Ben Fuller, Drew Walton, Anil Agrawal, Tony Luck,
Ilpo Järvinen, Lukas Wunner, Jonathan Cameron, Terry Bowman
On Wed, Mar 19, 2025 at 3:29 PM Sathyanarayanan Kuppuswamy
<sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
> What is the baseline version for this patch set? When I tried to apply it on
> v6.14-rc7 or linux-next, it does not apply cleanly.
v2 and v3 are both based off of pci/aer.
Thanks,
Jon
^ permalink raw reply [flat|nested] 28+ messages in thread