[PATCH] cxl/ras: Fix CPER handler device confusion

linux-cxl.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] cxl/ras: Fix CPER handler device confusion
@ 2025-06-11 17:17 Dan Williams
  2025-06-11 18:14 ` Dave Jiang
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Dan Williams @ 2025-06-11 17:17 UTC (permalink / raw)
  To: dave.jiang
  Cc: linux-cxl, Smita Koralahalli, Terry Bowman, Li Ming,
	Alison Schofield, Ira Weiny, Tony Luck

By inspection, cxl_cper_handle_prot_err() is making a series of fragile
assumptions that can lead to crashes:

1/ It assumes that endpoints identified in the record are CXL-type-3
   devices; nothing guarantees that.

2/ It assumes that the device is bound to the cxl_pci driver; nothing
   guarantees that.

3/ Minor, it holds the device lock over the switch-port tracing for no
   reason as the trace is 100% generated from data in the record.

Correct those by checking that the PCIe endpoint parents a cxl_memdev
before assuming the format of the driver data, and move the lock to where
it is required. Consequently this also makes the implementation ready for
CXL accelerators that are not bound to cxl_pci.

Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
Cc: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Cc: Terry Bowman <terry.bowman@amd.com>
Cc: Li Ming <ming.li@zohomail.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/ras.c | 48 ++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 485a831695c7..1173eaff7e2b 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -31,40 +31,38 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
 					       ras_cap.header_log);
 }
 
-static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
-				  struct cxl_ras_capability_regs ras_cap)
+static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
+					 struct cxl_ras_capability_regs ras_cap)
 {
 	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
-	struct cxl_dev_state *cxlds;
 
-	cxlds = pci_get_drvdata(pdev);
-	if (!cxlds)
-		return;
-
-	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+	trace_cxl_aer_correctable_error(cxlmd, status);
 }
 
-static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
-				    struct cxl_ras_capability_regs ras_cap)
+static void
+cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
+			       struct cxl_ras_capability_regs ras_cap)
 {
 	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
-	struct cxl_dev_state *cxlds;
 	u32 fe;
 
-	cxlds = pci_get_drvdata(pdev);
-	if (!cxlds)
-		return;
-
 	if (hweight32(status) > 1)
 		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
 				   ras_cap.cap_control));
 	else
 		fe = status;
 
-	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+	trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
 					  ras_cap.header_log);
 }
 
+static int match_memdev_by_parent(struct device *dev, const void *uport)
+{
+	if (is_cxl_memdev(dev) && dev->parent == uport)
+		return 1;
+	return 0;
+}
+
 static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 {
 	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
@@ -73,13 +71,13 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
 					    data->prot_err.agent_addr.bus,
 					    devfn);
+	struct cxl_memdev *cxlmd;
+	struct device *mem_dev;
 	int port_type;
 
 	if (!pdev)
 		return;
 
-	guard(device)(&pdev->dev);
-
 	port_type = pci_pcie_type(pdev);
 	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
 	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
@@ -92,10 +90,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 		return;
 	}
 
+	guard(device)(&pdev->dev);
+	if (!pdev->dev.driver)
+		return;
+
+	mem_dev = bus_find_device(&cxl_bus_type, NULL, pdev,
+				  match_memdev_by_parent);
+	if (!mem_dev)
+		return;
+
+	cxlmd = to_cxl_memdev(mem_dev);
 	if (data->severity == AER_CORRECTABLE)
-		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
+		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
 	else
-		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
+		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
 }
 
 static void cxl_cper_prot_err_work_fn(struct work_struct *work)

base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] cxl/ras: Fix CPER handler device confusion
  2025-06-11 17:17 [PATCH] cxl/ras: Fix CPER handler device confusion Dan Williams
@ 2025-06-11 18:14 ` Dave Jiang
  2025-06-11 18:41 ` Koralahalli Channabasappa, Smita
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Dave Jiang @ 2025-06-11 18:14 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-cxl, Smita Koralahalli, Terry Bowman, Li Ming,
	Alison Schofield, Ira Weiny, Tony Luck



On 6/11/25 10:17 AM, Dan Williams wrote:
> By inspection, cxl_cper_handle_prot_err() is making a series of fragile
> assumptions that can lead to crashes:
> 
> 1/ It assumes that endpoints identified in the record are a CXL-type-3
>    device, nothing guarantees that.
> 
> 2/ It assumes that the device is bound to the cxl_pci driver, nothing
>    guarantees that.
> 
> 3/ Minor, it holds the device lock over the switch-port tracing for no
>    reason as the trace is 100% generated from data in the record.
> 
> Correct those by checking that the PCIe endpoint parents a cxl_memdev
> before assuming the format of the driver data, and move the lock to where
> it is required. Consequently this also makes the implementation ready for
> CXL accelerators that are not bound to cxl_pci.
> 
> Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
> Cc: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Cc: Terry Bowman <terry.bowman@amd.com>
> Cc: Li Ming <ming.li@zohomail.com>
> Cc: Alison Schofield <alison.schofield@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Tony Luck <tony.luck@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  drivers/cxl/core/ras.c | 48 ++++++++++++++++++++++++------------------
>  1 file changed, 28 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 485a831695c7..1173eaff7e2b 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -31,40 +31,38 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
>  					       ras_cap.header_log);
>  }
>  
> -static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
> -				  struct cxl_ras_capability_regs ras_cap)
> +static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
> +					 struct cxl_ras_capability_regs ras_cap)
>  {
>  	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
> -	struct cxl_dev_state *cxlds;
>  
> -	cxlds = pci_get_drvdata(pdev);
> -	if (!cxlds)
> -		return;
> -
> -	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> +	trace_cxl_aer_correctable_error(cxlmd, status);
>  }
>  
> -static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
> -				    struct cxl_ras_capability_regs ras_cap)
> +static void
> +cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
> +			       struct cxl_ras_capability_regs ras_cap)
>  {
>  	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
> -	struct cxl_dev_state *cxlds;
>  	u32 fe;
>  
> -	cxlds = pci_get_drvdata(pdev);
> -	if (!cxlds)
> -		return;
> -
>  	if (hweight32(status) > 1)
>  		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>  				   ras_cap.cap_control));
>  	else
>  		fe = status;
>  
> -	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> +	trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
>  					  ras_cap.header_log);
>  }
>  
> +static int match_memdev_by_parent(struct device *dev, const void *uport)
> +{
> +	if (is_cxl_memdev(dev) && dev->parent == uport)
> +		return 1;
> +	return 0;
> +}
> +
>  static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  {
>  	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
> @@ -73,13 +71,13 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
>  					    data->prot_err.agent_addr.bus,
>  					    devfn);
> +	struct cxl_memdev *cxlmd;
> +	struct device *mem_dev;
>  	int port_type;
>  
>  	if (!pdev)
>  		return;
>  
> -	guard(device)(&pdev->dev);
> -
>  	port_type = pci_pcie_type(pdev);
>  	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
>  	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
> @@ -92,10 +90,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  		return;
>  	}
>  
> +	guard(device)(&pdev->dev);
> +	if (!pdev->dev.driver)
> +		return;
> +
> +	mem_dev = bus_find_device(&cxl_bus_type, NULL, pdev,
> +				  match_memdev_by_parent);
> +	if (!mem_dev)
> +		return;
> +
> +	cxlmd = to_cxl_memdev(mem_dev);
>  	if (data->severity == AER_CORRECTABLE)
> -		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
> +		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
>  	else
> -		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
> +		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
>  }
>  
>  static void cxl_cper_prot_err_work_fn(struct work_struct *work)
> 
> base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] cxl/ras: Fix CPER handler device confusion
  2025-06-11 17:17 [PATCH] cxl/ras: Fix CPER handler device confusion Dan Williams
  2025-06-11 18:14 ` Dave Jiang
@ 2025-06-11 18:41 ` Koralahalli Channabasappa, Smita
  2025-06-11 18:50 ` Bowman, Terry
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Koralahalli Channabasappa, Smita @ 2025-06-11 18:41 UTC (permalink / raw)
  To: Dan Williams, dave.jiang
  Cc: linux-cxl, Terry Bowman, Li Ming, Alison Schofield, Ira Weiny,
	Tony Luck

On 6/11/2025 10:17 AM, Dan Williams wrote:
> By inspection, cxl_cper_handle_prot_err() is making a series of fragile
> assumptions that can lead to crashes:
> 
> 1/ It assumes that endpoints identified in the record are a CXL-type-3
>     device, nothing guarantees that.
> 
> 2/ It assumes that the device is bound to the cxl_pci driver, nothing
>     guarantees that.
> 
> 3/ Minor, it holds the device lock over the switch-port tracing for no
>     reason as the trace is 100% generated from data in the record.
> 
> Correct those by checking that the PCIe endpoint parents a cxl_memdev
> before assuming the format of the driver data, and move the lock to where
> it is required. Consequently this also makes the implementation ready for
> CXL accelerators that are not bound to cxl_pci.
> 
> Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
> Cc: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Cc: Terry Bowman <terry.bowman@amd.com>
> Cc: Li Ming <ming.li@zohomail.com>
> Cc: Alison Schofield <alison.schofield@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Tony Luck <tony.luck@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>   drivers/cxl/core/ras.c | 48 ++++++++++++++++++++++++------------------
>   1 file changed, 28 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 485a831695c7..1173eaff7e2b 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -31,40 +31,38 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
>   					       ras_cap.header_log);
>   }
>   
> -static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
> -				  struct cxl_ras_capability_regs ras_cap)
> +static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
> +					 struct cxl_ras_capability_regs ras_cap)
>   {
>   	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
> -	struct cxl_dev_state *cxlds;
>   
> -	cxlds = pci_get_drvdata(pdev);
> -	if (!cxlds)
> -		return;
> -
> -	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> +	trace_cxl_aer_correctable_error(cxlmd, status);
>   }
>   
> -static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
> -				    struct cxl_ras_capability_regs ras_cap)
> +static void
> +cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
> +			       struct cxl_ras_capability_regs ras_cap)
>   {
>   	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
> -	struct cxl_dev_state *cxlds;
>   	u32 fe;
>   
> -	cxlds = pci_get_drvdata(pdev);
> -	if (!cxlds)
> -		return;
> -
>   	if (hweight32(status) > 1)
>   		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>   				   ras_cap.cap_control));
>   	else
>   		fe = status;
>   
> -	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> +	trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
>   					  ras_cap.header_log);
>   }
>   
> +static int match_memdev_by_parent(struct device *dev, const void *uport)
> +{
> +	if (is_cxl_memdev(dev) && dev->parent == uport)
> +		return 1;
> +	return 0;
> +}
> +
>   static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>   {
>   	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
> @@ -73,13 +71,13 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>   		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
>   					    data->prot_err.agent_addr.bus,
>   					    devfn);
> +	struct cxl_memdev *cxlmd;
> +	struct device *mem_dev;
>   	int port_type;
>   
>   	if (!pdev)
>   		return;
>   
> -	guard(device)(&pdev->dev);
> -
>   	port_type = pci_pcie_type(pdev);
>   	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
>   	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
> @@ -92,10 +90,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>   		return;
>   	}
>   
> +	guard(device)(&pdev->dev);
> +	if (!pdev->dev.driver)
> +		return;
> +
> +	mem_dev = bus_find_device(&cxl_bus_type, NULL, pdev,
> +				  match_memdev_by_parent);
> +	if (!mem_dev)
> +		return;
> +
> +	cxlmd = to_cxl_memdev(mem_dev);
>   	if (data->severity == AER_CORRECTABLE)
> -		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
> +		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
>   	else
> -		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
> +		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
>   }
>   
>   static void cxl_cper_prot_err_work_fn(struct work_struct *work)
> 
> base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494

Thank you for this. Looks good to me.

Reviewed-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] cxl/ras: Fix CPER handler device confusion
  2025-06-11 17:17 [PATCH] cxl/ras: Fix CPER handler device confusion Dan Williams
  2025-06-11 18:14 ` Dave Jiang
  2025-06-11 18:41 ` Koralahalli Channabasappa, Smita
@ 2025-06-11 18:50 ` Bowman, Terry
  2025-06-12 15:38   ` Dan Williams
  2025-06-12  5:25 ` Li Ming
  2025-06-12 19:20 ` [PATCH v2] " Dan Williams
  4 siblings, 1 reply; 11+ messages in thread
From: Bowman, Terry @ 2025-06-11 18:50 UTC (permalink / raw)
  To: Dan Williams, dave.jiang
  Cc: linux-cxl, Smita Koralahalli, Li Ming, Alison Schofield,
	Ira Weiny, Tony Luck



On 6/11/2025 12:17 PM, Dan Williams wrote:
> By inspection, cxl_cper_handle_prot_err() is making a series of fragile
> assumptions that can lead to crashes:
>
> 1/ It assumes that endpoints identified in the record are a CXL-type-3
>    device, nothing guarantees that.
>
> 2/ It assumes that the device is bound to the cxl_pci driver, nothing
>    guarantees that.
>
> 3/ Minor, it holds the device lock over the switch-port tracing for no
>    reason as the trace is 100% generated from data in the record.
>
> Correct those by checking that the PCIe endpoint parents a cxl_memdev
> before assuming the format of the driver data, and move the lock to where
> it is required. Consequently this also makes the implementation ready for
> CXL accelerators that are not bound to cxl_pci.

Hi Dan,

The AER CE/UCE trace routines access the pdev->dev and pdev->dev->parent.
Doesn't this require a guard() lock for all callers?

-Terry

> Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
> Cc: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Cc: Terry Bowman <terry.bowman@amd.com>
> Cc: Li Ming <ming.li@zohomail.com>
> Cc: Alison Schofield <alison.schofield@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Tony Luck <tony.luck@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  drivers/cxl/core/ras.c | 48 ++++++++++++++++++++++++------------------
>  1 file changed, 28 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 485a831695c7..1173eaff7e2b 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -31,40 +31,38 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
>  					       ras_cap.header_log);
>  }
>  
> -static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
> -				  struct cxl_ras_capability_regs ras_cap)
> +static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
> +					 struct cxl_ras_capability_regs ras_cap)
>  {
>  	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
> -	struct cxl_dev_state *cxlds;
>  
> -	cxlds = pci_get_drvdata(pdev);
> -	if (!cxlds)
> -		return;
> -
> -	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> +	trace_cxl_aer_correctable_error(cxlmd, status);
>  }
>  
> -static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
> -				    struct cxl_ras_capability_regs ras_cap)
> +static void
> +cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
> +			       struct cxl_ras_capability_regs ras_cap)
>  {
>  	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
> -	struct cxl_dev_state *cxlds;
>  	u32 fe;
>  
> -	cxlds = pci_get_drvdata(pdev);
> -	if (!cxlds)
> -		return;
> -
>  	if (hweight32(status) > 1)
>  		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>  				   ras_cap.cap_control));
>  	else
>  		fe = status;
>  
> -	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> +	trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
>  					  ras_cap.header_log);
>  }
>  
> +static int match_memdev_by_parent(struct device *dev, const void *uport)
> +{
> +	if (is_cxl_memdev(dev) && dev->parent == uport)
> +		return 1;
> +	return 0;
> +}
> +
>  static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  {
>  	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
> @@ -73,13 +71,13 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
>  					    data->prot_err.agent_addr.bus,
>  					    devfn);
> +	struct cxl_memdev *cxlmd;
> +	struct device *mem_dev;
>  	int port_type;
>  
>  	if (!pdev)
>  		return;
>  
> -	guard(device)(&pdev->dev);
> -
>  	port_type = pci_pcie_type(pdev);
>  	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
>  	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
> @@ -92,10 +90,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  		return;
>  	}
>  
> +	guard(device)(&pdev->dev);
> +	if (!pdev->dev.driver)
> +		return;
> +
> +	mem_dev = bus_find_device(&cxl_bus_type, NULL, pdev,
> +				  match_memdev_by_parent);
> +	if (!mem_dev)
> +		return;
> +
> +	cxlmd = to_cxl_memdev(mem_dev);
>  	if (data->severity == AER_CORRECTABLE)
> -		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
> +		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
>  	else
> -		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
> +		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
>  }
>  
>  static void cxl_cper_prot_err_work_fn(struct work_struct *work)
>
> base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] cxl/ras: Fix CPER handler device confusion
  2025-06-11 17:17 [PATCH] cxl/ras: Fix CPER handler device confusion Dan Williams
                   ` (2 preceding siblings ...)
  2025-06-11 18:50 ` Bowman, Terry
@ 2025-06-12  5:25 ` Li Ming
  2025-06-12 14:59   ` Dan Williams
  2025-06-12 19:20 ` [PATCH v2] " Dan Williams
  4 siblings, 1 reply; 11+ messages in thread
From: Li Ming @ 2025-06-12  5:25 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-cxl, Smita Koralahalli, Terry Bowman, Alison Schofield,
	Ira Weiny, Tony Luck, dave.jiang

On 6/12/2025 1:17 AM, Dan Williams wrote:
> By inspection, cxl_cper_handle_prot_err() is making a series of fragile
> assumptions that can lead to crashes:
>
> 1/ It assumes that endpoints identified in the record are a CXL-type-3
>    device, nothing guarantees that.
>
> 2/ It assumes that the device is bound to the cxl_pci driver, nothing
>    guarantees that.
>
> 3/ Minor, it holds the device lock over the switch-port tracing for no
>    reason as the trace is 100% generated from data in the record.
>
> Correct those by checking that the PCIe endpoint parents a cxl_memdev
> before assuming the format of the driver data, and move the lock to where
> it is required. Consequently this also makes the implementation ready for
> CXL accelerators that are not bound to cxl_pci.
>
> Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
> Cc: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Cc: Terry Bowman <terry.bowman@amd.com>
> Cc: Li Ming <ming.li@zohomail.com>
> Cc: Alison Schofield <alison.schofield@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Tony Luck <tony.luck@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  drivers/cxl/core/ras.c | 48 ++++++++++++++++++++++++------------------
>  1 file changed, 28 insertions(+), 20 deletions(-)
>
[snip]
>  static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  {
>  	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
> @@ -73,13 +71,13 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
>  					    data->prot_err.agent_addr.bus,
>  					    devfn);
> +	struct cxl_memdev *cxlmd;
> +	struct device *mem_dev;
>  	int port_type;
>  
>  	if (!pdev)
>  		return;
>  
> -	guard(device)(&pdev->dev);
> -
>  	port_type = pci_pcie_type(pdev);
>  	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
>  	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
> @@ -92,10 +90,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  		return;
>  	}
>  
> +	guard(device)(&pdev->dev);
> +	if (!pdev->dev.driver)
> +		return;
> +
> +	mem_dev = bus_find_device(&cxl_bus_type, NULL, pdev,
> +				  match_memdev_by_parent);
> +	if (!mem_dev)
> +		return;
> +
> +	cxlmd = to_cxl_memdev(mem_dev);
>  	if (data->severity == AER_CORRECTABLE)
> -		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
> +		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
>  	else
> -		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
> +		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);

missing put_device() for the mem_dev?


Ming

>  }
>  
>  static void cxl_cper_prot_err_work_fn(struct work_struct *work)
>
> base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] cxl/ras: Fix CPER handler device confusion
  2025-06-12  5:25 ` Li Ming
@ 2025-06-12 14:59   ` Dan Williams
  0 siblings, 0 replies; 11+ messages in thread
From: Dan Williams @ 2025-06-12 14:59 UTC (permalink / raw)
  To: Li Ming, Dan Williams
  Cc: linux-cxl, Smita Koralahalli, Terry Bowman, Alison Schofield,
	Ira Weiny, Tony Luck, dave.jiang

Li Ming wrote:
> On 6/12/2025 1:17 AM, Dan Williams wrote:
> > By inspection, cxl_cper_handle_prot_err() is making a series of fragile
> > assumptions that can lead to crashes:
> >
> > 1/ It assumes that endpoints identified in the record are a CXL-type-3
> >    device, nothing guarantees that.
> >
> > 2/ It assumes that the device is bound to the cxl_pci driver, nothing
> >    guarantees that.
> >
> > 3/ Minor, it holds the device lock over the switch-port tracing for no
> >    reason as the trace is 100% generated from data in the record.
> >
> > Correct those by checking that the PCIe endpoint parents a cxl_memdev
> > before assuming the format of the driver data, and move the lock to where
> > it is required. Consequently this also makes the implementation ready for
> > CXL accelerators that are not bound to cxl_pci.
> >
> > Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
> > Cc: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> > Cc: Terry Bowman <terry.bowman@amd.com>
> > Cc: Li Ming <ming.li@zohomail.com>
> > Cc: Alison Schofield <alison.schofield@intel.com>
> > Cc: Ira Weiny <ira.weiny@intel.com>
> > Cc: Tony Luck <tony.luck@intel.com>
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
[..]
> > @@ -92,10 +90,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
> >  		return;
> >  	}
> >  
> > +	guard(device)(&pdev->dev);
> > +	if (!pdev->dev.driver)
> > +		return;
> > +
> > +	mem_dev = bus_find_device(&cxl_bus_type, NULL, pdev,
> > +				  match_memdev_by_parent);
> > +	if (!mem_dev)
> > +		return;
> > +
> > +	cxlmd = to_cxl_memdev(mem_dev);
> >  	if (data->severity == AER_CORRECTABLE)
> > -		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
> > +		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
> >  	else
> > -		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
> > +		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
> 
> missing put_device() for the mem_dev?

Yes, good catch!

A thinko on my part: I forgot to add cleanup for that.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] cxl/ras: Fix CPER handler device confusion
  2025-06-11 18:50 ` Bowman, Terry
@ 2025-06-12 15:38   ` Dan Williams
  0 siblings, 0 replies; 11+ messages in thread
From: Dan Williams @ 2025-06-12 15:38 UTC (permalink / raw)
  To: Bowman, Terry, Dan Williams, dave.jiang
  Cc: linux-cxl, Smita Koralahalli, Li Ming, Alison Schofield,
	Ira Weiny, Tony Luck

Bowman, Terry wrote:
> 
> 
> On 6/11/2025 12:17 PM, Dan Williams wrote:
> > By inspection, cxl_cper_handle_prot_err() is making a series of fragile
> > assumptions that can lead to crashes:
> >
> > 1/ It assumes that endpoints identified in the record are a CXL-type-3
> >    device, nothing guarantees that.
> >
> > 2/ It assumes that the device is bound to the cxl_pci driver, nothing
> >    guarantees that.
> >
> > 3/ Minor, it holds the device lock over the switch-port tracing for no
> >    reason as the trace is 100% generated from data in the record.
> >
> > Correct those by checking that the PCIe endpoint parents a cxl_memdev
> > before assuming the format of the driver data, and move the lock to where
> > it is required. Consequently this also makes the implementation ready for
> > CXL accelerators that are not bound to cxl_pci.
> 
> Hi Dan,
> 
> The AER CE/UCE trace routines access the pdev->dev and pdev->dev->parent.
> Doesn't this require a guard() lock for all callers?

The device_lock() does not protect the liveness of dev and dev->parent. The
AER path is able to assume that a reference on @pdev keeps
@pdev->dev.parent valid.

Typically a parent device only has an elevated reference once while a
child is registered unless a subsystem arranges for a parent to be
pinned until the final put of the child. This is what PCI does. See
pci_release_dev() where it unpins the bus, and see release_pcibus_dev()
where it unpins the bridge device (parent of the pci_dev).

So, no device_lock() needed to walk the PCI device hierarchy if you have
an endpoint PCI device with an elevated reference count. If we need to
make the same guarantee for cxl_port objects, we can. To date it has
been sufficient to just follow the typical "unpin parent at
device_del()" expectation.

Code just needs to be careful: holding a device object with an elevated
reference count does not mean the device is still registered (i.e. it
could be deleted, but not freed, at any point).

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2] cxl/ras: Fix CPER handler device confusion
  2025-06-11 17:17 [PATCH] cxl/ras: Fix CPER handler device confusion Dan Williams
                   ` (3 preceding siblings ...)
  2025-06-12  5:25 ` Li Ming
@ 2025-06-12 19:20 ` Dan Williams
  2025-06-12 23:59   ` Li Ming
                     ` (2 more replies)
  4 siblings, 3 replies; 11+ messages in thread
From: Dan Williams @ 2025-06-12 19:20 UTC (permalink / raw)
  To: dave.jiang
  Cc: linux-cxl, Terry Bowman, Li Ming, Alison Schofield, Ira Weiny,
	Tony Luck, Smita Koralahalli

By inspection, cxl_cper_handle_prot_err() is making a series of fragile
assumptions that can lead to crashes:

1/ It assumes that endpoints identified in the record are CXL-type-3
   devices; nothing guarantees that.

2/ It assumes that the device is bound to the cxl_pci driver; nothing
   guarantees that.

3/ Minor, it holds the device lock over the switch-port tracing for no
   reason as the trace is 100% generated from data in the record.

Correct those by checking that the PCIe endpoint parents a cxl_memdev
before assuming the format of the driver data, and move the lock to where
it is required. Consequently this also makes the implementation ready for
CXL accelerators that are not bound to cxl_pci.

Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
Cc: Terry Bowman <terry.bowman@amd.com>
Cc: Li Ming <ming.li@zohomail.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Reviewed-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
Changes in v2:
* drop the reference from the result of bus_find_device() (Li Ming)

 drivers/cxl/core/ras.c | 47 ++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 485a831695c7..2731ba3a0799 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -31,40 +31,38 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
 					       ras_cap.header_log);
 }
 
-static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
-				  struct cxl_ras_capability_regs ras_cap)
+static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
+					 struct cxl_ras_capability_regs ras_cap)
 {
 	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
-	struct cxl_dev_state *cxlds;
 
-	cxlds = pci_get_drvdata(pdev);
-	if (!cxlds)
-		return;
-
-	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+	trace_cxl_aer_correctable_error(cxlmd, status);
 }
 
-static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
-				    struct cxl_ras_capability_regs ras_cap)
+static void
+cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
+			       struct cxl_ras_capability_regs ras_cap)
 {
 	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
-	struct cxl_dev_state *cxlds;
 	u32 fe;
 
-	cxlds = pci_get_drvdata(pdev);
-	if (!cxlds)
-		return;
-
 	if (hweight32(status) > 1)
 		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
 				   ras_cap.cap_control));
 	else
 		fe = status;
 
-	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+	trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
 					  ras_cap.header_log);
 }
 
+static int match_memdev_by_parent(struct device *dev, const void *uport)
+{
+	if (is_cxl_memdev(dev) && dev->parent == uport)
+		return 1;
+	return 0;
+}
+
 static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 {
 	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
@@ -73,13 +71,12 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
 					    data->prot_err.agent_addr.bus,
 					    devfn);
+	struct cxl_memdev *cxlmd;
 	int port_type;
 
 	if (!pdev)
 		return;
 
-	guard(device)(&pdev->dev);
-
 	port_type = pci_pcie_type(pdev);
 	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
 	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
@@ -92,10 +89,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 		return;
 	}
 
+	guard(device)(&pdev->dev);
+	if (!pdev->dev.driver)
+		return;
+
+	struct device *mem_dev __free(put_device) = bus_find_device(
+		&cxl_bus_type, NULL, pdev, match_memdev_by_parent);
+	if (!mem_dev)
+		return;
+
+	cxlmd = to_cxl_memdev(mem_dev);
 	if (data->severity == AER_CORRECTABLE)
-		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
+		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
 	else
-		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
+		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
 }
 
 static void cxl_cper_prot_err_work_fn(struct work_struct *work)

base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] cxl/ras: Fix CPER handler device confusion
  2025-06-12 19:20 ` [PATCH v2] " Dan Williams
@ 2025-06-12 23:59   ` Li Ming
  2025-06-13 10:37   ` Jonathan Cameron
  2025-06-13 16:07   ` Dave Jiang
  2 siblings, 0 replies; 11+ messages in thread
From: Li Ming @ 2025-06-12 23:59 UTC (permalink / raw)
  To: Dan Williams, dave.jiang
  Cc: linux-cxl, Terry Bowman, Alison Schofield, Ira Weiny, Tony Luck,
	Smita Koralahalli

On 6/13/2025 3:20 AM, Dan Williams wrote:
> By inspection, cxl_cper_handle_prot_err() is making a series of fragile
> assumptions that can lead to crashes:
>
> 1/ It assumes that endpoints identified in the record are a CXL-type-3
>    device, nothing guarantees that.
>
> 2/ It assumes that the device is bound to the cxl_pci driver, nothing
>    guarantees that.
>
> 3/ Minor, it holds the device lock over the switch-port tracing for no
>    reason as the trace is 100% generated from data in the record.
>
> Correct those by checking that the PCIe endpoint parents a cxl_memdev
> before assuming the format of the driver data, and move the lock to where
> it is required. Consequently this also makes the implementation ready for
> CXL accelerators that are not bound to cxl_pci.
>
> Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
> Cc: Terry Bowman <terry.bowman@amd.com>
> Cc: Li Ming <ming.li@zohomail.com>
> Cc: Alison Schofield <alison.schofield@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Tony Luck <tony.luck@intel.com>
> Reviewed-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Li Ming <ming.li@zohomail.com>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] cxl/ras: Fix CPER handler device confusion
  2025-06-12 19:20 ` [PATCH v2] " Dan Williams
  2025-06-12 23:59   ` Li Ming
@ 2025-06-13 10:37   ` Jonathan Cameron
  2025-06-13 16:07   ` Dave Jiang
  2 siblings, 0 replies; 11+ messages in thread
From: Jonathan Cameron @ 2025-06-13 10:37 UTC (permalink / raw)
  To: Dan Williams
  Cc: dave.jiang, linux-cxl, Terry Bowman, Li Ming, Alison Schofield,
	Ira Weiny, Tony Luck, Smita Koralahalli

On Thu, 12 Jun 2025 12:20:43 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> By inspection, cxl_cper_handle_prot_err() is making a series of fragile
> assumptions that can lead to crashes:
> 
> 1/ It assumes that endpoints identified in the record are a CXL-type-3
>    device, nothing guarantees that.
> 
> 2/ It assumes that the device is bound to the cxl_pci driver, nothing
>    guarantees that.
> 
> 3/ Minor, it holds the device lock over the switch-port tracing for no
>    reason as the trace is 100% generated from data in the record.
> 
> Correct those by checking that the PCIe endpoint parents a cxl_memdev
> before assuming the format of the driver data, and move the lock to where
> it is required. Consequently this also makes the implementation ready for
> CXL accelerators that are not bound to cxl_pci.
> 
> Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
> Cc: Terry Bowman <terry.bowman@amd.com>
> Cc: Li Ming <ming.li@zohomail.com>
> Cc: Alison Schofield <alison.schofield@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Tony Luck <tony.luck@intel.com>
> Reviewed-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] cxl/ras: Fix CPER handler device confusion
  2025-06-12 19:20 ` [PATCH v2] " Dan Williams
  2025-06-12 23:59   ` Li Ming
  2025-06-13 10:37   ` Jonathan Cameron
@ 2025-06-13 16:07   ` Dave Jiang
  2 siblings, 0 replies; 11+ messages in thread
From: Dave Jiang @ 2025-06-13 16:07 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-cxl, Terry Bowman, Li Ming, Alison Schofield, Ira Weiny,
	Tony Luck, Smita Koralahalli



On 6/12/25 12:20 PM, Dan Williams wrote:
> By inspection, cxl_cper_handle_prot_err() is making a series of fragile
> assumptions that can lead to crashes:
> 
> 1/ It assumes that endpoints identified in the record are a CXL-type-3
>    device, nothing guarantees that.
> 
> 2/ It assumes that the device is bound to the cxl_pci driver, nothing
>    guarantees that.
> 
> 3/ Minor, it holds the device lock over the switch-port tracing for no
>    reason as the trace is 100% generated from data in the record.
> 
> Correct those by checking that the PCIe endpoint parents a cxl_memdev
> before assuming the format of the driver data, and move the lock to where
> it is required. Consequently this also makes the implementation ready for
> CXL accelerators that are not bound to cxl_pci.
> 
> Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
> Cc: Terry Bowman <terry.bowman@amd.com>
> Cc: Li Ming <ming.li@zohomail.com>
> Cc: Alison Schofield <alison.schofield@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Tony Luck <tony.luck@intel.com>
> Reviewed-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Applied to cxl/fixes

> ---
> Changes in v2:
> * drop the reference from the result of bus_find_device() (Li Ming)
> 
>  drivers/cxl/core/ras.c | 47 ++++++++++++++++++++++++------------------
>  1 file changed, 27 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 485a831695c7..2731ba3a0799 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -31,40 +31,38 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
>  					       ras_cap.header_log);
>  }
>  
> -static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
> -				  struct cxl_ras_capability_regs ras_cap)
> +static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
> +					 struct cxl_ras_capability_regs ras_cap)
>  {
>  	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
> -	struct cxl_dev_state *cxlds;
>  
> -	cxlds = pci_get_drvdata(pdev);
> -	if (!cxlds)
> -		return;
> -
> -	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> +	trace_cxl_aer_correctable_error(cxlmd, status);
>  }
>  
> -static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
> -				    struct cxl_ras_capability_regs ras_cap)
> +static void
> +cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
> +			       struct cxl_ras_capability_regs ras_cap)
>  {
>  	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
> -	struct cxl_dev_state *cxlds;
>  	u32 fe;
>  
> -	cxlds = pci_get_drvdata(pdev);
> -	if (!cxlds)
> -		return;
> -
>  	if (hweight32(status) > 1)
>  		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>  				   ras_cap.cap_control));
>  	else
>  		fe = status;
>  
> -	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> +	trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
>  					  ras_cap.header_log);
>  }
>  
> +static int match_memdev_by_parent(struct device *dev, const void *uport)
> +{
> +	if (is_cxl_memdev(dev) && dev->parent == uport)
> +		return 1;
> +	return 0;
> +}
> +
>  static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  {
>  	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
> @@ -73,13 +71,12 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
>  					    data->prot_err.agent_addr.bus,
>  					    devfn);
> +	struct cxl_memdev *cxlmd;
>  	int port_type;
>  
>  	if (!pdev)
>  		return;
>  
> -	guard(device)(&pdev->dev);
> -
>  	port_type = pci_pcie_type(pdev);
>  	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
>  	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
> @@ -92,10 +89,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  		return;
>  	}
>  
> +	guard(device)(&pdev->dev);
> +	if (!pdev->dev.driver)
> +		return;
> +
> +	struct device *mem_dev __free(put_device) = bus_find_device(
> +		&cxl_bus_type, NULL, pdev, match_memdev_by_parent);
> +	if (!mem_dev)
> +		return;
> +
> +	cxlmd = to_cxl_memdev(mem_dev);
>  	if (data->severity == AER_CORRECTABLE)
> -		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
> +		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
>  	else
> -		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
> +		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
>  }
>  
>  static void cxl_cper_prot_err_work_fn(struct work_struct *work)
> 
> base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2025-06-13 16:07 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2025-06-11 17:17 [PATCH] cxl/ras: Fix CPER handler device confusion Dan Williams
2025-06-11 18:14 ` Dave Jiang
2025-06-11 18:41 ` Koralahalli Channabasappa, Smita
2025-06-11 18:50 ` Bowman, Terry
2025-06-12 15:38   ` Dan Williams
2025-06-12  5:25 ` Li Ming
2025-06-12 14:59   ` Dan Williams
2025-06-12 19:20 ` [PATCH v2] " Dan Williams
2025-06-12 23:59   ` Li Ming
2025-06-13 10:37   ` Jonathan Cameron
2025-06-13 16:07   ` Dave Jiang

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).