Linux PCI subsystem development
 help / color / mirror / Atom feed
From: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@linux.intel.com>
To: Jon Pan-Doh <pandoh@google.com>,
	Bjorn Helgaas <bhelgaas@google.com>,
	Karolina Stolarek <karolina.stolarek@oracle.com>
Cc: linux-pci@vger.kernel.org,
	"Martin Petersen" <martin.petersen@oracle.com>,
	"Ben Fuller" <ben.fuller@oracle.com>,
	"Drew Walton" <drewwalton@microsoft.com>,
	"Anil Agrawal" <anilagrawal@meta.com>,
	"Tony Luck" <tony.luck@intel.com>,
	"Ilpo Järvinen" <ilpo.jarvinen@linux.intel.com>,
	"Lukas Wunner" <lukas@wunner.de>,
	"Jonathan Cameron" <Jonathan.Cameron@huawei.com>,
	"Terry Bowman" <Terry.bowman@amd.com>
Subject: Re: [PATCH v4 5/7] PCI/AER: Introduce ratelimit for error logs
Date: Thu, 20 Mar 2025 18:00:08 -0700	[thread overview]
Message-ID: <85bd0cd9-c09f-464d-9397-ced829df27d7@linux.intel.com> (raw)
In-Reply-To: <20250320082057.622983-6-pandoh@google.com>

Hi,

On 3/20/25 1:20 AM, Jon Pan-Doh wrote:
> Spammy devices can flood kernel logs with AER errors and slow/stall
> execution. Add per-device ratelimits for AER correctable and uncorrectable
> errors that use the kernel defaults (10 per 5s).

Should we exclude fatal errors from the rate limit? Fatal error logs would be
really useful for debug analysis, and they do not happen very frequently.

>
> Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged
> while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) show
> true count of 11.
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git
>
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> Reviewed-by: Karolina Stolarek <karolina.stolarek@oracle.com>
> ---
>   drivers/pci/pcie/aer.c | 74 +++++++++++++++++++++++++++++++++---------
>   1 file changed, 58 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 3069376b3553..081cef5fc678 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -28,6 +28,7 @@
>   #include <linux/interrupt.h>
>   #include <linux/delay.h>
>   #include <linux/kfifo.h>
> +#include <linux/ratelimit.h>
>   #include <linux/slab.h>
>   #include <acpi/apei.h>
>   #include <acpi/ghes.h>
> @@ -88,6 +89,10 @@ struct aer_report {
>   	u64 rootport_total_cor_errs;
>   	u64 rootport_total_fatal_errs;
>   	u64 rootport_total_nonfatal_errs;
> +
> +	/* Ratelimits for errors */
> +	struct ratelimit_state cor_log_ratelimit;
> +	struct ratelimit_state uncor_log_ratelimit;
>   };
>   
>   #define AER_LOG_TLP_MASKS		(PCI_ERR_UNC_POISON_TLP|	\
> @@ -379,6 +384,15 @@ void pci_aer_init(struct pci_dev *dev)
>   
>   	dev->aer_report = kzalloc(sizeof(*dev->aer_report), GFP_KERNEL);
>   
> +	/*
> +	 * Ratelimits are doubled as a given error produces 2 logs (root port
> +	 * and endpoint) that should be under same ratelimit.
> +	 */
> +	ratelimit_state_init(&dev->aer_report->cor_log_ratelimit,
> +			     DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST * 2);
> +	ratelimit_state_init(&dev->aer_report->uncor_log_ratelimit,
> +			     DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST * 2);
> +
>   	/*
>   	 * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
>   	 * PCI_ERR_COR_MASK, and PCI_ERR_CAP.  Root and Root Complex Event
> @@ -668,6 +682,17 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
>   	}
>   }
>   
> +static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
> +{
> +	struct ratelimit_state *ratelimit;
> +
> +	if (severity == AER_CORRECTABLE)
> +		ratelimit = &dev->aer_report->cor_log_ratelimit;
> +	else
> +		ratelimit = &dev->aer_report->uncor_log_ratelimit;
> +	return __ratelimit(ratelimit);
> +}
> +
>   static void __aer_print_error(struct pci_dev *dev,
>   			      struct aer_err_info *info,
>   			      const char *level)
> @@ -698,6 +723,12 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
>   	int layer, agent;
>   	int id = pci_dev_id(dev);
>   
> +	trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
> +			info->severity, info->tlp_header_valid, &info->tlp);
> +
> +	if (!aer_ratelimit(dev, info->severity))
> +		return;
> +
>   	if (!info->status) {
>   		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
>   			aer_error_severity_string[info->severity]);
> @@ -722,21 +753,28 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
>   out:
>   	if (info->id && info->error_dev_num > 1 && info->id == id)
>   		pci_err(dev, "  Error of this Agent is reported first\n");
> -
> -	trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
> -			info->severity, info->tlp_header_valid, &info->tlp);
>   }
>   
>   static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
>   {
>   	u8 bus = info->id >> 8;
>   	u8 devfn = info->id & 0xff;
> +	struct pci_dev *endpoint;
> +	int i;
> +
> +	/* extract endpoint device ratelimit */
> +	for (i = 0; i < info->error_dev_num; i++) {
> +		endpoint = info->dev[i];
> +		if (info->id == pci_dev_id(endpoint))
> +			break;
> +	}
>   
> -	pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d\n",
> -		 info->multi_error_valid ? "Multiple " : "",
> -		 aer_error_severity_string[info->severity],
> -		 pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
> -		 PCI_FUNC(devfn));
> +	if (aer_ratelimit(endpoint, info->severity))
> +		pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d\n",
> +			 info->multi_error_valid ? "Multiple " : "",
> +			 aer_error_severity_string[info->severity],
> +			 pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
> +			 PCI_FUNC(devfn));
>   }
>   
>   #ifdef CONFIG_ACPI_APEI_PCIEAER
> @@ -784,6 +822,12 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>   
>   	pci_dev_aer_stats_incr(dev, &info);
>   
> +	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
> +			aer_severity, tlp_header_valid, &aer->header_log);
> +
> +	if (!aer_ratelimit(dev, aer_severity))
> +		return;
> +
>   	aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
>   	__aer_print_error(dev, &info, level);
>   	aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
> @@ -795,9 +839,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>   
>   	if (tlp_header_valid)
>   		pcie_print_tlp_log(dev, &aer->header_log, dev_fmt("  "));
> -
> -	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
> -			aer_severity, tlp_header_valid, &aer->header_log);
>   }
>   EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
>   
> @@ -1299,10 +1340,11 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
>   			e_info.multi_error_valid = 1;
>   		else
>   			e_info.multi_error_valid = 0;
> -		aer_print_port_info(pdev, &e_info);
>   
> -		if (find_source_device(pdev, &e_info))
> +		if (find_source_device(pdev, &e_info)) {
> +			aer_print_port_info(pdev, &e_info);
>   			aer_process_err_devices(&e_info, KERN_WARNING);
> +		}
>   	}
>   
>   	if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
> @@ -1318,10 +1360,10 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
>   		else
>   			e_info.multi_error_valid = 0;
>   
> -		aer_print_port_info(pdev, &e_info);
> -
> -		if (find_source_device(pdev, &e_info))
> +		if (find_source_device(pdev, &e_info)) {
> +			aer_print_port_info(pdev, &e_info);
>   			aer_process_err_devices(&e_info, KERN_ERR);
> +		}
>   	}
>   }
>   

-- 
Sathyanarayanan Kuppuswamy
Linux Kernel Developer


  parent reply	other threads:[~2025-03-21  1:00 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-03-20  8:20 [PATCH v4 0/7] Rate limit AER logs Jon Pan-Doh
2025-03-20  8:20 ` [PATCH v4 1/7] PCI/AER: Check log level once and propagate down Jon Pan-Doh
2025-03-20  8:20 ` [PATCH v4 2/7] PCI/AER: Make all pci_print_aer() log levels depend on error type Jon Pan-Doh
2025-03-20  8:20 ` [PATCH v4 3/7] PCI/AER: Move AER stat collection out of __aer_print_error() Jon Pan-Doh
2025-03-20 14:59   ` Karolina Stolarek
2025-03-20 19:07     ` Jon Pan-Doh
2025-03-20  8:20 ` [PATCH v4 4/7] PCI/AER: Rename struct aer_stats to aer_report Jon Pan-Doh
2025-03-20 17:42   ` Sathyanarayanan Kuppuswamy
2025-03-20 19:53     ` Jon Pan-Doh
2025-03-21 13:38       ` Karolina Stolarek
2025-03-20  8:20 ` [PATCH v4 5/7] PCI/AER: Introduce ratelimit for error logs Jon Pan-Doh
2025-03-20 14:56   ` Karolina Stolarek
2025-03-20 17:51     ` Bjorn Helgaas
2025-03-20 19:53       ` Jon Pan-Doh
2025-03-20 20:29         ` Bjorn Helgaas
2025-03-21  1:58           ` Jon Pan-Doh
2025-03-20 19:37     ` Jon Pan-Doh
2025-03-21  1:00   ` Sathyanarayanan Kuppuswamy [this message]
2025-03-21 19:24     ` Jon Pan-Doh
2025-03-21 21:47       ` Sathyanarayanan Kuppuswamy
2025-03-21 21:59         ` Bjorn Helgaas
2025-03-21 22:11         ` Jon Pan-Doh
2025-03-20  8:20 ` [PATCH v4 6/7] PCI/AER: Add ratelimits to PCI AER Documentation Jon Pan-Doh
2025-03-20 14:57   ` Karolina Stolarek
2025-03-21  1:00   ` Sathyanarayanan Kuppuswamy
2025-03-20  8:20 ` [PATCH v4 7/7] PCI/AER: Add sysfs attributes for log ratelimits Jon Pan-Doh
2025-03-20 14:58   ` Karolina Stolarek
2025-03-20 19:36     ` Jon Pan-Doh
2025-03-21  1:02   ` Sathyanarayanan Kuppuswamy
2025-03-21  1:55     ` Jon Pan-Doh
2025-03-20 14:34 ` [PATCH v4 0/7] Rate limit AER logs Christoph Hellwig
2025-03-20 18:45 ` Paul E. McKenney

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=85bd0cd9-c09f-464d-9397-ced829df27d7@linux.intel.com \
    --to=sathyanarayanan.kuppuswamy@linux.intel.com \
    --cc=Jonathan.Cameron@huawei.com \
    --cc=Terry.bowman@amd.com \
    --cc=anilagrawal@meta.com \
    --cc=ben.fuller@oracle.com \
    --cc=bhelgaas@google.com \
    --cc=drewwalton@microsoft.com \
    --cc=ilpo.jarvinen@linux.intel.com \
    --cc=karolina.stolarek@oracle.com \
    --cc=linux-pci@vger.kernel.org \
    --cc=lukas@wunner.de \
    --cc=martin.petersen@oracle.com \
    --cc=pandoh@google.com \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox