linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
To: Terry Bowman <terry.bowman@amd.com>
Cc: <PradeepVineshReddy.Kodamati@amd.com>, <dave@stgolabs.net>,
	<dave.jiang@intel.com>, <alison.schofield@intel.com>,
	<vishal.l.verma@intel.com>, <ira.weiny@intel.com>,
	<dan.j.williams@intel.com>, <bhelgaas@google.com>, <bp@alien8.de>,
	<ming.li@zohomail.com>, <shiju.jose@huawei.com>,
	<dan.carpenter@linaro.org>,
	<Smita.KoralahalliChannabasappa@amd.com>,
	<kobayashi.da-06@fujitsu.com>, <yanfei.xu@intel.com>,
	<rrichter@amd.com>, <peterz@infradead.org>, <colyli@suse.de>,
	<uaisheng.ye@intel.com>, <fabio.m.de.francesco@linux.intel.com>,
	<ilpo.jarvinen@linux.intel.com>, <yazen.ghannam@amd.com>,
	<linux-cxl@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<linux-pci@vger.kernel.org>
Subject: Re: [PATCH v9 12/16] cxl/pci: Introduce CXL Endpoint protocol error handlers
Date: Thu, 12 Jun 2025 17:55:56 +0100	[thread overview]
Message-ID: <20250612175556.000036b5@huawei.com> (raw)
In-Reply-To: <20250603172239.159260-13-terry.bowman@amd.com>

On Tue, 3 Jun 2025 12:22:35 -0500
Terry Bowman <terry.bowman@amd.com> wrote:

> CXL Endpoint protocol errors are currently handled using PCI error
> handlers. The CXL Endpoint requires CXL specific handling in the case of
> uncorrectable error handling not provided by the PCI handlers.
> 
> Add CXL specific handlers for CXL Endpoints. Rename the existing
> cxl_error_handlers to be pci_error_handlers to more correctly indicate
> the error type and follow naming consistency.

I wonder if a rename precursor patch would be better here than doing it
all in one go?

Having not read the patch description thoroughly this had me confused ;)

> 
> Keep the existing PCI Endpoint handlers. PCI handlers can be called if the
> CXL device is not trained for alternate protocol (CXL). Update the CXL
> Endpoint PCI handlers to call the CXL handler. If the CXL uncorrectable
> handler returns PCI_ERS_RESULT_PANIC then the PCI handler invokes panic().
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>


>  
>  static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
>  {
> -

So this snuck in somewhere between upstream and here.  If it is in this
set let's push the removal back to where it came from.

>  	return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
>  }
>  
> @@ -844,14 +843,15 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
>  static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
>  #endif
>
> +pci_ers_result_t cxl_error_detected(struct device *dev)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +	struct device *cxlmd_dev = &cxlds->cxlmd->dev;
> +	pci_ers_result_t ue;
> +
> +	scoped_guard(device, cxlmd_dev) {
>  		if (!dev->driver) {
>  			dev_warn(&pdev->dev,
>  				 "%s: memdev disabled, abort error handling\n",
>  				 dev_name(dev));
> -			return PCI_ERS_RESULT_DISCONNECT;
> +			return PCI_ERS_RESULT_PANIC;
>  		}
>  
>  		if (cxlds->rcd)
> @@ -892,29 +900,25 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>  		ue = cxl_handle_endpoint_ras(cxlds);
>  	}
>  
> -

Maybe something more in the patch description on why this chunk isn't relevant?
I guess we don't need the more complex handling as these are all panic :)

> -	switch (state) {
> -	case pci_channel_io_normal:
> -		if (ue) {
> -			device_release_driver(dev);
> -			return PCI_ERS_RESULT_NEED_RESET;
> -		}
> -		return PCI_ERS_RESULT_CAN_RECOVER;
> -	case pci_channel_io_frozen:
> -		dev_warn(&pdev->dev,
> -			 "%s: frozen state error detected, disable CXL.mem\n",
> -			 dev_name(dev));
> -		device_release_driver(dev);
> -		return PCI_ERS_RESULT_NEED_RESET;
> -	case pci_channel_io_perm_failure:
> -		dev_warn(&pdev->dev,
> -			 "failure state error detected, request disconnect\n");
> -		return PCI_ERS_RESULT_DISCONNECT;
> -	}
> -	return PCI_ERS_RESULT_NEED_RESET;
> +	return ue;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");

>  static int cxl_flit_size(struct pci_dev *pdev)
>  {
>  	if (cxl_pci_flit_256(pdev))
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 0ef8c2068c0c..664f532cc838 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c

> @@ -244,6 +244,8 @@ static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
>  static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
>  {
>  	struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info));
> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +	struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
>  
>  	if (!pdev) {
>  		pr_err("Failed to find the CXL device\n");
> @@ -260,15 +262,13 @@ static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
>  
>  	if (err_info->severity == AER_CORRECTABLE) {
>  		int aer = pdev->aer_cap;
> -		struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> -		struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);

This code move seems somewhat unrelated?  Maybe it's in the wrong patch and
becomes necessary later?

>  
>  		if (aer)
>  			pci_clear_and_set_config_dword(pdev,
>  						       aer + PCI_ERR_COR_STATUS,
>  						       0, PCI_ERR_COR_INTERNAL);
>  
> -		cxl_cor_error_detected(pdev);
> +		cxl_cor_error_detected(&pdev->dev);
>  
>  		pcie_clear_device_status(pdev);
>  	} else {




  parent reply	other threads:[~2025-06-12 16:56 UTC|newest]

Thread overview: 90+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-06-03 17:22 [PATCH v9 00/16] Enable CXL PCIe port protocol error handling and logging Terry Bowman
2025-06-03 17:22 ` [PATCH v9 01/16] PCI/CXL: Add pcie_is_cxl() Terry Bowman
2025-06-04 19:06   ` Sathyanarayanan Kuppuswamy
2025-06-04 19:18     ` Bowman, Terry
2025-06-05 23:24   ` Dave Jiang
2025-06-03 17:22 ` [PATCH v9 02/16] PCI/AER: Report CXL or PCIe bus error type in trace logging Terry Bowman
2025-06-03 22:02   ` Sathyanarayanan Kuppuswamy
2025-06-04 14:32     ` Bowman, Terry
2025-06-04 19:24       ` Sathyanarayanan Kuppuswamy
2025-06-04 21:30         ` Bowman, Terry
2025-06-05 23:28   ` Dave Jiang
2025-06-03 17:22 ` [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors Terry Bowman
2025-06-04  6:01   ` Dan Carpenter
2025-06-04 14:37     ` Bowman, Terry
2025-06-04 17:24       ` Dan Carpenter
2025-06-04 19:21         ` Bowman, Terry
2025-06-04 22:50   ` Sathyanarayanan Kuppuswamy
2025-06-05 14:04     ` Bowman, Terry
2025-06-06  0:27   ` Dave Jiang
2025-06-06 14:27     ` Bowman, Terry
2025-06-06 14:36       ` Dave Jiang
2025-06-12 11:04   ` Jonathan Cameron
2025-06-12 14:29     ` Bowman, Terry
2025-06-03 17:22 ` [PATCH v9 04/16] PCI/AER: Dequeue forwarded CXL error Terry Bowman
2025-06-04  6:05   ` Dan Carpenter
2025-06-04 14:38     ` Bowman, Terry
2025-06-04 23:58   ` Sathyanarayanan Kuppuswamy
2025-06-06 15:57   ` Dave Jiang
2025-06-06 18:14     ` Bowman, Terry
2025-06-06 22:43       ` Dave Jiang
2025-06-09 19:57         ` Bowman, Terry
2025-06-09 20:34           ` Dave Jiang
2025-06-12 11:17             ` Jonathan Cameron
2025-06-06 21:08     ` Bowman, Terry
2025-06-06 23:15     ` Bowman, Terry
2025-06-09 20:17       ` Dave Jiang
2025-06-10  4:15   ` Lukas Wunner
2025-06-10 18:07     ` Bowman, Terry
2025-06-10 21:20       ` Bowman, Terry
2025-06-11  4:38         ` Lukas Wunner
2025-06-17 16:08           ` Dave Jiang
2025-06-17 18:20             ` Robert Richter
2025-06-12 11:36   ` Jonathan Cameron
2025-06-12 18:35     ` Bowman, Terry
2025-06-03 17:22 ` [PATCH v9 05/16] CXL/PCI: Introduce CXL uncorrectable protocol error recovery Terry Bowman
2025-06-05 15:14   ` Sathyanarayanan Kuppuswamy
2025-06-05 16:01     ` Bowman, Terry
2025-06-06 16:45   ` Dave Jiang
2025-06-06 18:16     ` Bowman, Terry
2025-06-12 16:06   ` Jonathan Cameron
2025-06-12 16:29     ` Bowman, Terry
2025-06-03 17:22 ` [PATCH v9 06/16] cxl/pci: Move RAS initialization to cxl_port driver Terry Bowman
2025-06-06 17:04   ` Dave Jiang
2025-06-06 18:17     ` Bowman, Terry
2025-06-03 17:22 ` [PATCH v9 07/16] cxl/pci: Map CXL Endpoint Port and CXL Switch Port RAS registers Terry Bowman
2025-06-03 17:22 ` [PATCH v9 08/16] cxl/pci: Update RAS handler interfaces to also support CXL Ports Terry Bowman
2025-06-05 16:42   ` Sathyanarayanan Kuppuswamy
2025-06-03 17:22 ` [PATCH v9 09/16] cxl/pci: Log message if RAS registers are unmapped Terry Bowman
2025-06-05 16:42   ` Sathyanarayanan Kuppuswamy
2025-06-06 17:27   ` Dave Jiang
2025-06-03 17:22 ` [PATCH v9 10/16] cxl/pci: Unify CXL trace logging for CXL Endpoints and CXL Ports Terry Bowman
2025-06-05 16:49   ` Sathyanarayanan Kuppuswamy
2025-06-06  9:08   ` Shiju Jose
2025-06-06 14:41     ` Bowman, Terry
2025-06-06 15:24       ` Bowman, Terry
2025-06-12 16:25         ` Jonathan Cameron
2025-06-03 17:22 ` [PATCH v9 11/16] cxl/pci: Update __cxl_handle_cor_ras() to return early if no RAS errors Terry Bowman
2025-06-05 18:37   ` Sathyanarayanan Kuppuswamy
2025-06-06 20:30   ` Dave Jiang
2025-06-06 20:55     ` Bowman, Terry
2025-06-06 22:38       ` Dave Jiang
2025-06-12 16:46   ` Jonathan Cameron
2025-06-16 20:30     ` Bowman, Terry
2025-06-03 17:22 ` [PATCH v9 12/16] cxl/pci: Introduce CXL Endpoint protocol error handlers Terry Bowman
2025-06-06  0:22   ` Sathyanarayanan Kuppuswamy
2025-06-12 16:55   ` Jonathan Cameron [this message]
2025-06-03 17:22 ` [PATCH v9 13/16] cxl/pci: Introduce CXL Port " Terry Bowman
2025-06-06  0:50   ` Sathyanarayanan Kuppuswamy
2025-06-12 17:14   ` Jonathan Cameron
2025-06-16 22:17     ` Bowman, Terry
2025-06-03 17:22 ` [PATCH v9 14/16] cxl/pci: Remove unnecessary CXL Endpoint handling helper functions Terry Bowman
2025-06-06  0:50   ` Sathyanarayanan Kuppuswamy
2025-06-12 17:16   ` Jonathan Cameron
2025-06-03 17:22 ` [PATCH v9 15/16] CXL/PCI: Enable CXL protocol errors during CXL Port probe Terry Bowman
2025-06-06  0:51   ` Sathyanarayanan Kuppuswamy
2025-06-03 17:22 ` [PATCH v9 16/16] CXL/PCI: Disable CXL protocol error interrupts during CXL Port cleanup Terry Bowman
2025-06-06  0:52   ` Sathyanarayanan Kuppuswamy
2025-06-06 13:51     ` Bowman, Terry
2025-06-06 22:59   ` Dave Jiang
2025-06-12 17:19   ` Jonathan Cameron

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250612175556.000036b5@huawei.com \
    --to=jonathan.cameron@huawei.com \
    --cc=PradeepVineshReddy.Kodamati@amd.com \
    --cc=Smita.KoralahalliChannabasappa@amd.com \
    --cc=alison.schofield@intel.com \
    --cc=bhelgaas@google.com \
    --cc=bp@alien8.de \
    --cc=colyli@suse.de \
    --cc=dan.carpenter@linaro.org \
    --cc=dan.j.williams@intel.com \
    --cc=dave.jiang@intel.com \
    --cc=dave@stgolabs.net \
    --cc=fabio.m.de.francesco@linux.intel.com \
    --cc=ilpo.jarvinen@linux.intel.com \
    --cc=ira.weiny@intel.com \
    --cc=kobayashi.da-06@fujitsu.com \
    --cc=linux-cxl@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=ming.li@zohomail.com \
    --cc=peterz@infradead.org \
    --cc=rrichter@amd.com \
    --cc=shiju.jose@huawei.com \
    --cc=terry.bowman@amd.com \
    --cc=uaisheng.ye@intel.com \
    --cc=vishal.l.verma@intel.com \
    --cc=yanfei.xu@intel.com \
    --cc=yazen.ghannam@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).