Linux PCI subsystem development
 help / color / mirror / Atom feed
* [PATCH] PCI: Fix PCIe SBR dev/link wait error
@ 2025-11-24 10:45 Guanghui Feng
  2025-11-24 23:58 ` Bjorn Helgaas
  2025-11-26  8:20 ` Ilpo Järvinen
  0 siblings, 2 replies; 25+ messages in thread
From: Guanghui Feng @ 2025-11-24 10:45 UTC (permalink / raw)
  To: bhelgaas; +Cc: linux-pci, kanie, alikernel-developer

When executing a PCIe secondary bus reset, all downstream switches and
endpoints will generate reset events. Simultaneously, all PCIe links
will undergo retraining, and each link will independently re-execute the
LTSSM state machine training. Therefore, after executing the SBR, it is
necessary to wait for all downstream links and devices to complete
recovery. Otherwise, after the SBR returns, accessing devices with some
links or endpoints not yet fully recovered may result in driver errors,
or even trigger device offline issues.

Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
---
 drivers/pci/pci.c | 112 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 94 insertions(+), 18 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b14dd064006c..9cf13fe69d94 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4788,9 +4788,74 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
 	return max(min_delay, max_delay);
 }
 
+int __pci_bridge_wait_for_secondary_bus(struct pci_dev *, unsigned long, char *);
+
+static int pci_dev_wait_child(struct pci_dev *pdev, unsigned long start_t, int timeout,
+				char *reset_type)
+{
+	struct pci_dev *child, **dev = NULL;
+	int ct = 0, i = 0, ret = 0, left = 1;
+	unsigned long dev_start_t;
+
+	down_read(&pci_bus_sem);
+
+	list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
+		ct++;
+
+	if (ct) {
+		dev = kzalloc(sizeof(struct pci_dev *) * ct, GFP_KERNEL);
+
+		if(!dev) {
+			pci_err(pdev, "dev mem alloc err\n");
+			up_read(&pci_bus_sem);
+			return -ENOMEM;
+		}
+
+		list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
+			dev[i++] = pci_dev_get(child);
+	}
+
+	up_read(&pci_bus_sem);
+
+	for (i = 0; i < ct; ++i) {
+		left = 1;
+
+dev_wait:
+		dev_start_t = jiffies;
+		ret = pci_dev_wait(dev[i], reset_type, left);
+		timeout -= jiffies_to_msecs(jiffies - dev_start_t);
+
+		if (ret) {
+			if (pci_dev_is_disconnected(dev[i]))
+				continue;
+
+			if (timeout <= 0)
+				goto end;
+
+			left <<= 1;
+			left = timeout > left ? left : timeout;
+			goto dev_wait;
+		}
+	}
+
+	for (i = 0; i < ct; ++i) {
+		ret = __pci_bridge_wait_for_secondary_bus(dev[i], start_t, reset_type);
+		if (ret)
+			break;
+	}
+
+end:
+	for (i = 0; i < ct; ++i)
+		pci_dev_put(dev[i]);
+
+	kfree(dev);
+	return ret;
+}
+
 /**
- * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
+ * __pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
  * @dev: PCI bridge
+ * @start_t: wait start jiffies time
  * @reset_type: reset type in human-readable form
  *
  * Handle necessary delays before access to the devices on the secondary
@@ -4804,10 +4869,9 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
  * Return 0 on success or -ENOTTY if the first device on the secondary bus
  * failed to become accessible.
  */
-int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
+int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t, char *reset_type)
 {
-	struct pci_dev *child __free(pci_dev_put) = NULL;
-	int delay;
+	int delay, left;
 
 	if (pci_dev_is_disconnected(dev))
 		return 0;
@@ -4835,8 +4899,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 		return 0;
 	}
 
-	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
-					     struct pci_dev, bus_list));
 	up_read(&pci_bus_sem);
 
 	/*
@@ -4844,8 +4906,12 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	 * accessing the device after reset (that is 1000 ms + 100 ms).
 	 */
 	if (!pci_is_pcie(dev)) {
-		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
-		msleep(1000 + delay);
+		left = 1000 + delay - jiffies_to_msecs(jiffies - start_t);
+		pci_dbg(dev, "waiting %d ms for secondary bus\n", left > 0 ? left : 0);
+
+		if (left > 0)
+			msleep(left);
+
 		return 0;
 	}
 
@@ -4870,10 +4936,14 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
 		u16 status;
 
-		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
-		msleep(delay);
+		left = delay - jiffies_to_msecs(jiffies - start_t);
+		pci_dbg(dev, "waiting %d ms for downstream link\n", left > 0 ? left : 0);
+
+		if (left > 0)
+			msleep(left);
 
-		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
+		left = PCI_RESET_WAIT - jiffies_to_msecs(jiffies - start_t);
+		if(!pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type))
 			return 0;
 
 		/*
@@ -4888,20 +4958,26 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 		if (!(status & PCI_EXP_LNKSTA_DLLLA))
 			return -ENOTTY;
 
-		return pci_dev_wait(child, reset_type,
-				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
+		left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
+		return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
 	}
 
-	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
-		delay);
-	if (!pcie_wait_for_link_delay(dev, true, delay)) {
+	left = delay - jiffies_to_msecs(jiffies - start_t);
+	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n", left > 0 ? left : 0);
+
+	if (!pcie_wait_for_link_delay(dev, true, left > 0 ? left : 0)) {
 		/* Did not train, no need to wait any further */
 		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
 		return -ENOTTY;
 	}
 
-	return pci_dev_wait(child, reset_type,
-			    PCIE_RESET_READY_POLL_MS - delay);
+	left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
+	return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
+}
+
+int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
+{
+	return __pci_bridge_wait_for_secondary_bus(dev, jiffies, reset_type);
 }
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
-- 
2.32.0.3.gf3a3e56d6


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-11-24 10:45 [PATCH] PCI: Fix PCIe SBR dev/link wait error Guanghui Feng
@ 2025-11-24 23:58 ` Bjorn Helgaas
  2025-11-25  6:20   ` guanghui.fgh
  2025-11-26  8:20 ` Ilpo Järvinen
  1 sibling, 1 reply; 25+ messages in thread
From: Bjorn Helgaas @ 2025-11-24 23:58 UTC (permalink / raw)
  To: Guanghui Feng; +Cc: bhelgaas, linux-pci, kanie, alikernel-developer

On Mon, Nov 24, 2025 at 06:45:02PM +0800, Guanghui Feng wrote:
> When executing a PCIe secondary bus reset, all downstream switches and
> endpoints will generate reset events. Simultaneously, all PCIe links
> will undergo retraining, and each link will independently re-execute the
> LTSSM state machine training. Therefore, after executing the SBR, it is
> necessary to wait for all downstream links and devices to complete
> recovery. Otherwise, after the SBR returns, accessing devices with some
> links or endpoints not yet fully recovered may result in driver errors,
> or even trigger device offline issues.

I guess this solves a problem you have observed?

Are there any specific details you can share that would help
illustrate the problem?  E.g., cases where we do a Secondary Bus
Reset, then access a downstream device too early, and an error
happens?

Bjorn

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-11-24 23:58 ` Bjorn Helgaas
@ 2025-11-25  6:20   ` guanghui.fgh
  2025-12-01 10:03     ` Lukas Wunner
  0 siblings, 1 reply; 25+ messages in thread
From: guanghui.fgh @ 2025-11-25  6:20 UTC (permalink / raw)
  To: Bjorn Helgaas; +Cc: bhelgaas, linux-pci, kanie, alikernel-developer

Thanks for your response.

1. When a multifunction device is connected to a PCIe slot that supports hotplug, 
   during the passthrogh device to guest process based on VFIO, some devices
   need to perform a hot reset to initialize the device:
   vfio_pci_dev_set_hot_reset ---> __pci_reset_slot/__pci_reset_bus --->
   pci_bridge_secondary_bus_reset ---> pci_bridge_wait_for_secondary_bus.

   After __pci_reset_slot/__pci_reset_bus calls pci_bridge_wait_for_secondary_bus,
   the device will be restored via pci_dev_restore. However, when a multifunction PCIe device
   is connected, executing pci_bridge_wait_for_secondary_bus only guarantees the restoration
   of a random device. For other devices that are still restoring, executing pci_dev_restore cannot
   restore the device state normally, resulting in errors or even device offline.

2. In the PCIe dpc_handler process, do pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link)
   to restores the PCIe link. 
   But dpc_reset_link waits for the link to recover via pci_bridge_wait_for_secondary_bus before returning.
   Similarly, pcie_do_recovery restores devices via pci_walk_bridge(bridge, report_resume, &status),
   which also requires pci_bridge_wait_for_secondary_bus to wait for all devices to recover completely.
   For other devices that are still restoring, executing report_resume cannot restore the device state normally,
   resulting in errors or even device offline.

On Mon, Nov 24, 2025 at 06:45:02PM +0800, Guanghui Feng wrote:
> When executing a PCIe secondary bus reset, all downstream switches and
> endpoints will generate reset events. Simultaneously, all PCIe links
> will undergo retraining, and each link will independently re-execute the
> LTSSM state machine training. Therefore, after executing the SBR, it is
> necessary to wait for all downstream links and devices to complete
> recovery. Otherwise, after the SBR returns, accessing devices with some
> links or endpoints not yet fully recovered may result in driver errors,
> or even trigger device offline issues.

I guess this solves a problem you have observed?

Are there any specific details you can share that would help
illustrate the problem?  E.g., cases where we do a Secondary Bus
Reset, then access a downstream device too early, and an error
happens?

Bjorn

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-11-24 10:45 [PATCH] PCI: Fix PCIe SBR dev/link wait error Guanghui Feng
  2025-11-24 23:58 ` Bjorn Helgaas
@ 2025-11-26  8:20 ` Ilpo Järvinen
  2025-11-26 12:08   ` guanghui.fgh
  1 sibling, 1 reply; 25+ messages in thread
From: Ilpo Järvinen @ 2025-11-26  8:20 UTC (permalink / raw)
  To: Guanghui Feng; +Cc: bhelgaas, linux-pci, kanie, alikernel-developer

On Mon, 24 Nov 2025, Guanghui Feng wrote:

> When executing a PCIe secondary bus reset, all downstream switches and
> endpoints will generate reset events. Simultaneously, all PCIe links
> will undergo retraining, and each link will independently re-execute the
> LTSSM state machine training. Therefore, after executing the SBR, it is
> necessary to wait for all downstream links and devices to complete
> recovery. Otherwise, after the SBR returns, accessing devices with some
> links or endpoints not yet fully recovered may result in driver errors,
> or even trigger device offline issues.
> 
> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
> ---
>  drivers/pci/pci.c | 112 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 94 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index b14dd064006c..9cf13fe69d94 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -4788,9 +4788,74 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>  	return max(min_delay, max_delay);
>  }
>  
> +int __pci_bridge_wait_for_secondary_bus(struct pci_dev *, unsigned long, char *);
> +
> +static int pci_dev_wait_child(struct pci_dev *pdev, unsigned long start_t, int timeout,
> +				char *reset_type)
> +{
> +	struct pci_dev *child, **dev = NULL;
> +	int ct = 0, i = 0, ret = 0, left = 1;
> +	unsigned long dev_start_t;
> +
> +	down_read(&pci_bus_sem);
> +
> +	list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
> +		ct++;
> +
> +	if (ct) {
> +		dev = kzalloc(sizeof(struct pci_dev *) * ct, GFP_KERNEL);
> +
> +		if(!dev) {
> +			pci_err(pdev, "dev mem alloc err\n");
> +			up_read(&pci_bus_sem);
> +			return -ENOMEM;
> +		}
> +
> +		list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
> +			dev[i++] = pci_dev_get(child);
> +	}
> +
> +	up_read(&pci_bus_sem);
> +
> +	for (i = 0; i < ct; ++i) {
> +		left = 1;
> +
> +dev_wait:
> +		dev_start_t = jiffies;
> +		ret = pci_dev_wait(dev[i], reset_type, left);
> +		timeout -= jiffies_to_msecs(jiffies - dev_start_t);
> +
> +		if (ret) {
> +			if (pci_dev_is_disconnected(dev[i]))
> +				continue;
> +
> +			if (timeout <= 0)
> +				goto end;
> +
> +			left <<= 1;
> +			left = timeout > left ? left : timeout;
> +			goto dev_wait;
> +		}
> +	}
> +
> +	for (i = 0; i < ct; ++i) {
> +		ret = __pci_bridge_wait_for_secondary_bus(dev[i], start_t, reset_type);

Does this add recursion without restoring config space on each level 
before starting the child wait?

> +		if (ret)
> +			break;
> +	}
> +
> +end:
> +	for (i = 0; i < ct; ++i)
> +		pci_dev_put(dev[i]);
> +
> +	kfree(dev);
> +	return ret;
> +}
> +
>  /**
> - * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
> + * __pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
>   * @dev: PCI bridge
> + * @start_t: wait start jiffies time
>   * @reset_type: reset type in human-readable form
>   *
>   * Handle necessary delays before access to the devices on the secondary
> @@ -4804,10 +4869,9 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>   * Return 0 on success or -ENOTTY if the first device on the secondary bus
>   * failed to become accessible.
>   */
> -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t, char *reset_type)
>  {
> -	struct pci_dev *child __free(pci_dev_put) = NULL;
> -	int delay;
> +	int delay, left;
>  
>  	if (pci_dev_is_disconnected(dev))
>  		return 0;
> @@ -4835,8 +4899,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  		return 0;
>  	}
>  
> -	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> -					     struct pci_dev, bus_list));
>  	up_read(&pci_bus_sem);
>  
>  	/*
> @@ -4844,8 +4906,12 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	 * accessing the device after reset (that is 1000 ms + 100 ms).
>  	 */
>  	if (!pci_is_pcie(dev)) {
> -		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
> -		msleep(1000 + delay);
> +		left = 1000 + delay - jiffies_to_msecs(jiffies - start_t);
> +		pci_dbg(dev, "waiting %d ms for secondary bus\n", left > 0 ? left : 0);
> +
> +		if (left > 0)
> +			msleep(left);
> +
>  		return 0;
>  	}
>  
> @@ -4870,10 +4936,14 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
>  		u16 status;
>  
> -		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> -		msleep(delay);
> +		left = delay - jiffies_to_msecs(jiffies - start_t);
> +		pci_dbg(dev, "waiting %d ms for downstream link\n", left > 0 ? left : 0);
> +
> +		if (left > 0)
> +			msleep(left);
>  
> -		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> +		left = PCI_RESET_WAIT - jiffies_to_msecs(jiffies - start_t);
> +		if(!pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type))
>  			return 0;
>  
>  		/*
> @@ -4888,20 +4958,26 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  		if (!(status & PCI_EXP_LNKSTA_DLLLA))
>  			return -ENOTTY;
>  
> -		return pci_dev_wait(child, reset_type,
> -				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
> +		left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
> +		return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
>  	}
>  
> -	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> -		delay);
> -	if (!pcie_wait_for_link_delay(dev, true, delay)) {
> +	left = delay - jiffies_to_msecs(jiffies - start_t);
> +	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n", left > 0 ? left : 0);
> +
> +	if (!pcie_wait_for_link_delay(dev, true, left > 0 ? left : 0)) {
>  		/* Did not train, no need to wait any further */
>  		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
>  		return -ENOTTY;
>  	}
>  
> -	return pci_dev_wait(child, reset_type,
> -			    PCIE_RESET_READY_POLL_MS - delay);
> +	left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
> +	return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
> +}
> +
> +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +{
> +	return __pci_bridge_wait_for_secondary_bus(dev, jiffies, reset_type);
>  }
>  
>  void pci_reset_secondary_bus(struct pci_dev *dev)
> 

-- 
 i.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-11-26  8:20 ` Ilpo Järvinen
@ 2025-11-26 12:08   ` guanghui.fgh
  2025-11-26 12:37     ` Ilpo Järvinen
  0 siblings, 1 reply; 25+ messages in thread
From: guanghui.fgh @ 2025-11-26 12:08 UTC (permalink / raw)
  To: Ilpo Järvinen; +Cc: bhelgaas, linux-pci, kanie, alikernel-developer

1. Does this add recursion without restoring config space on each level 
before starting the child wait?

Yes
The current implementation does not require restoring the PCIe device configuration space.
The status is determined during the waiting process, either based on software-recorded status
or on the device's RO and HWINIT type registers.




> When executing a PCIe secondary bus reset, all downstream switches and
> endpoints will generate reset events. Simultaneously, all PCIe links
> will undergo retraining, and each link will independently re-execute the
> LTSSM state machine training. Therefore, after executing the SBR, it is
> necessary to wait for all downstream links and devices to complete
> recovery. Otherwise, after the SBR returns, accessing devices with some
> links or endpoints not yet fully recovered may result in driver errors,
> or even trigger device offline issues.
> 
> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
> ---
>  drivers/pci/pci.c | 112 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 94 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index b14dd064006c..9cf13fe69d94 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -4788,9 +4788,74 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>   return max(min_delay, max_delay);
>  }
>  
> +int __pci_bridge_wait_for_secondary_bus(struct pci_dev *, unsigned long, char *);
> +
> +static int pci_dev_wait_child(struct pci_dev *pdev, unsigned long start_t, int timeout,
> +    char *reset_type)
> +{
> + struct pci_dev *child, **dev = NULL;
> + int ct = 0, i = 0, ret = 0, left = 1;
> + unsigned long dev_start_t;
> +
> + down_read(&pci_bus_sem);
> +
> + list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
> +  ct++;
> +
> + if (ct) {
> +  dev = kzalloc(sizeof(struct pci_dev *) * ct, GFP_KERNEL);
> +
> +  if(!dev) {
> +   pci_err(pdev, "dev mem alloc err\n");
> +   up_read(&pci_bus_sem);
> +   return -ENOMEM;
> +  }
> +
> +  list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
> +   dev[i++] = pci_dev_get(child);
> + }
> +
> + up_read(&pci_bus_sem);
> +
> + for (i = 0; i < ct; ++i) {
> +  left = 1;
> +
> +dev_wait:
> +  dev_start_t = jiffies;
> +  ret = pci_dev_wait(dev[i], reset_type, left);
> +  timeout -= jiffies_to_msecs(jiffies - dev_start_t);
> +
> +  if (ret) {
> +   if (pci_dev_is_disconnected(dev[i]))
> +    continue;
> +
> +   if (timeout <= 0)
> +    goto end;
> +
> +   left <<= 1;
> +   left = timeout > left ? left : timeout;
> +   goto dev_wait;
> +  }
> + }
> +
> + for (i = 0; i < ct; ++i) {
> +  ret = __pci_bridge_wait_for_secondary_bus(dev[i], start_t, reset_type);

Does this add recursion without restoring config space on each level 
before starting the child wait?

> +  if (ret)
> +   break;
> + }
> +
> +end:
> + for (i = 0; i < ct; ++i)
> +  pci_dev_put(dev[i]);
> +
> + kfree(dev);
> + return ret;
> +}
> +
>  /**
> - * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
> + * __pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
>   * @dev: PCI bridge
> + * @start_t: wait start jiffies time
>   * @reset_type: reset type in human-readable form
>   *
>   * Handle necessary delays before access to the devices on the secondary
> @@ -4804,10 +4869,9 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>   * Return 0 on success or -ENOTTY if the first device on the secondary bus
>   * failed to become accessible.
>   */
> -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t, char *reset_type)
>  {
> - struct pci_dev *child __free(pci_dev_put) = NULL;
> - int delay;
> + int delay, left;
>  
>   if (pci_dev_is_disconnected(dev))
>    return 0;
> @@ -4835,8 +4899,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>    return 0;
>   }
>  
> - child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> -          struct pci_dev, bus_list));
>   up_read(&pci_bus_sem);
>  
>   /*
> @@ -4844,8 +4906,12 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>    * accessing the device after reset (that is 1000 ms + 100 ms).
>    */
>   if (!pci_is_pcie(dev)) {
> -  pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
> -  msleep(1000 + delay);
> +  left = 1000 + delay - jiffies_to_msecs(jiffies - start_t);
> +  pci_dbg(dev, "waiting %d ms for secondary bus\n", left > 0 ? left : 0);
> +
> +  if (left > 0)
> +   msleep(left);
> +
>    return 0;
>   }
>  
> @@ -4870,10 +4936,14 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>   if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
>    u16 status;
>  
> -  pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> -  msleep(delay);
> +  left = delay - jiffies_to_msecs(jiffies - start_t);
> +  pci_dbg(dev, "waiting %d ms for downstream link\n", left > 0 ? left : 0);
> +
> +  if (left > 0)
> +   msleep(left);
>  
> -  if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> +  left = PCI_RESET_WAIT - jiffies_to_msecs(jiffies - start_t);
> +  if(!pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type))
>     return 0;
>  
>    /*
> @@ -4888,20 +4958,26 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>    if (!(status & PCI_EXP_LNKSTA_DLLLA))
>     return -ENOTTY;
>  
> -  return pci_dev_wait(child, reset_type,
> -        PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
> +  left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
> +  return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
>   }
>  
> - pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> -  delay);
> - if (!pcie_wait_for_link_delay(dev, true, delay)) {
> + left = delay - jiffies_to_msecs(jiffies - start_t);
> + pci_dbg(dev, "waiting %d ms for downstream link, after activation\n", left > 0 ? left : 0);
> +
> + if (!pcie_wait_for_link_delay(dev, true, left > 0 ? left : 0)) {
>    /* Did not train, no need to wait any further */
>    pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
>    return -ENOTTY;
>   }
>  
> - return pci_dev_wait(child, reset_type,
> -       PCIE_RESET_READY_POLL_MS - delay);
> + left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
> + return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
> +}
> +
> +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +{
> + return __pci_bridge_wait_for_secondary_bus(dev, jiffies, reset_type);
>  }
>  
>  void pci_reset_secondary_bus(struct pci_dev *dev)
> 

-- 
 i.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-11-26 12:08   ` guanghui.fgh
@ 2025-11-26 12:37     ` Ilpo Järvinen
  2025-11-26 14:22       ` guanghui.fgh
  0 siblings, 1 reply; 25+ messages in thread
From: Ilpo Järvinen @ 2025-11-26 12:37 UTC (permalink / raw)
  To: guanghui.fgh; +Cc: bhelgaas, linux-pci, kanie, alikernel-developer

[-- Attachment #1: Type: text/plain, Size: 7968 bytes --]

On Wed, 26 Nov 2025, guanghui.fgh wrote:

> 1. Does this add recursion without restoring config space on each level 
> before starting the child wait?
> 
> Yes
> The current implementation does not require restoring the PCIe device 
> configuration space. The status is determined during the waiting 
> process, either based on software-recorded status or on the device's RO 
> and HWINIT type registers.

What guarantees a link even come up without restoring the config space
first (it may work in your case but is that true universally)?

The commit 3e40aa29d47e ("PCI: Wait for Link before restoring Downstream 
Buses") tried to address this sub-hierarchy wait issue within the 
recursive bus restore (without me ever encountering this problem for 
real). So I don't understand why you had to add the recursion to the wait 
side as well?

-- 
 i.

> > When executing a PCIe secondary bus reset, all downstream switches and
> > endpoints will generate reset events. Simultaneously, all PCIe links
> > will undergo retraining, and each link will independently re-execute the
> > LTSSM state machine training. Therefore, after executing the SBR, it is
> > necessary to wait for all downstream links and devices to complete
> > recovery. Otherwise, after the SBR returns, accessing devices with some
> > links or endpoints not yet fully recovered may result in driver errors,
> > or even trigger device offline issues.
> > 
> > Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> > Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
> > ---
> >  drivers/pci/pci.c | 112 ++++++++++++++++++++++++++++++++++++++--------
> >  1 file changed, 94 insertions(+), 18 deletions(-)
> > 
> > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > index b14dd064006c..9cf13fe69d94 100644
> > --- a/drivers/pci/pci.c
> > +++ b/drivers/pci/pci.c
> > @@ -4788,9 +4788,74 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
> >   return max(min_delay, max_delay);
> >  }
> >  
> > +int __pci_bridge_wait_for_secondary_bus(struct pci_dev *, unsigned long, char *);
> > +
> > +static int pci_dev_wait_child(struct pci_dev *pdev, unsigned long start_t, int timeout,
> > +    char *reset_type)
> > +{
> > + struct pci_dev *child, **dev = NULL;
> > + int ct = 0, i = 0, ret = 0, left = 1;
> > + unsigned long dev_start_t;
> > +
> > + down_read(&pci_bus_sem);
> > +
> > + list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
> > +  ct++;
> > +
> > + if (ct) {
> > +  dev = kzalloc(sizeof(struct pci_dev *) * ct, GFP_KERNEL);
> > +
> > +  if(!dev) {
> > +   pci_err(pdev, "dev mem alloc err\n");
> > +   up_read(&pci_bus_sem);
> > +   return -ENOMEM;
> > +  }
> > +
> > +  list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
> > +   dev[i++] = pci_dev_get(child);
> > + }
> > +
> > + up_read(&pci_bus_sem);
> > +
> > + for (i = 0; i < ct; ++i) {
> > +  left = 1;
> > +
> > +dev_wait:
> > +  dev_start_t = jiffies;
> > +  ret = pci_dev_wait(dev[i], reset_type, left);
> > +  timeout -= jiffies_to_msecs(jiffies - dev_start_t);
> > +
> > +  if (ret) {
> > +   if (pci_dev_is_disconnected(dev[i]))
> > +    continue;
> > +
> > +   if (timeout <= 0)
> > +    goto end;
> > +
> > +   left <<= 1;
> > +   left = timeout > left ? left : timeout;
> > +   goto dev_wait;
> > +  }
> > + }
> > +
> > + for (i = 0; i < ct; ++i) {
> > +  ret = __pci_bridge_wait_for_secondary_bus(dev[i], start_t, reset_type);
> 
> Does this add recursion without restoring config space on each level 
> before starting the child wait?
> 
> > +  if (ret)
> > +   break;
> > + }
> > +
> > +end:
> > + for (i = 0; i < ct; ++i)
> > +  pci_dev_put(dev[i]);
> > +
> > + kfree(dev);
> > + return ret;
> > +}
> > +
> >  /**
> > - * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
> > + * __pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
> >   * @dev: PCI bridge
> > + * @start_t: wait start jiffies time
> >   * @reset_type: reset type in human-readable form
> >   *
> >   * Handle necessary delays before access to the devices on the secondary
> > @@ -4804,10 +4869,9 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
> >   * Return 0 on success or -ENOTTY if the first device on the secondary bus
> >   * failed to become accessible.
> >   */
> > -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> > +int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t, char *reset_type)
> >  {
> > - struct pci_dev *child __free(pci_dev_put) = NULL;
> > - int delay;
> > + int delay, left;
> >  
> >   if (pci_dev_is_disconnected(dev))
> >    return 0;
> > @@ -4835,8 +4899,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> >    return 0;
> >   }
> >  
> > - child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> > -          struct pci_dev, bus_list));
> >   up_read(&pci_bus_sem);
> >  
> >   /*
> > @@ -4844,8 +4906,12 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> >    * accessing the device after reset (that is 1000 ms + 100 ms).
> >    */
> >   if (!pci_is_pcie(dev)) {
> > -  pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
> > -  msleep(1000 + delay);
> > +  left = 1000 + delay - jiffies_to_msecs(jiffies - start_t);
> > +  pci_dbg(dev, "waiting %d ms for secondary bus\n", left > 0 ? left : 0);
> > +
> > +  if (left > 0)
> > +   msleep(left);
> > +
> >    return 0;
> >   }
> >  
> > @@ -4870,10 +4936,14 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> >   if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
> >    u16 status;
> >  
> > -  pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> > -  msleep(delay);
> > +  left = delay - jiffies_to_msecs(jiffies - start_t);
> > +  pci_dbg(dev, "waiting %d ms for downstream link\n", left > 0 ? left : 0);
> > +
> > +  if (left > 0)
> > +   msleep(left);
> >  
> > -  if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> > +  left = PCI_RESET_WAIT - jiffies_to_msecs(jiffies - start_t);
> > +  if(!pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type))
> >     return 0;
> >  
> >    /*
> > @@ -4888,20 +4958,26 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> >    if (!(status & PCI_EXP_LNKSTA_DLLLA))
> >     return -ENOTTY;
> >  
> > -  return pci_dev_wait(child, reset_type,
> > -        PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
> > +  left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
> > +  return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
> >   }
> >  
> > - pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> > -  delay);
> > - if (!pcie_wait_for_link_delay(dev, true, delay)) {
> > + left = delay - jiffies_to_msecs(jiffies - start_t);
> > + pci_dbg(dev, "waiting %d ms for downstream link, after activation\n", left > 0 ? left : 0);
> > +
> > + if (!pcie_wait_for_link_delay(dev, true, left > 0 ? left : 0)) {
> >    /* Did not train, no need to wait any further */
> >    pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
> >    return -ENOTTY;
> >   }
> >  
> > - return pci_dev_wait(child, reset_type,
> > -       PCIE_RESET_READY_POLL_MS - delay);
> > + left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
> > + return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
> > +}
> > +
> > +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> > +{
> > + return __pci_bridge_wait_for_secondary_bus(dev, jiffies, reset_type);
> >  }
> >  
> >  void pci_reset_secondary_bus(struct pci_dev *dev)
> > 
> 
> -- 
>  i.
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-11-26 12:37     ` Ilpo Järvinen
@ 2025-11-26 14:22       ` guanghui.fgh
  2025-11-26 14:47         ` Ilpo Järvinen
  0 siblings, 1 reply; 25+ messages in thread
From: guanghui.fgh @ 2025-11-26 14:22 UTC (permalink / raw)
  To: Ilpo Järvinen; +Cc: bhelgaas, linux-pci, kanie, alikernel-developer

Refer to the previous email discussion replies:

1. When a multifunction device is connected to a PCIe slot that supports hotplug, 
   during the passthrogh device to guest process based on VFIO, some devices
   need to perform a hot reset to initialize the device:
   vfio_pci_dev_set_hot_reset ---> __pci_reset_slot/__pci_reset_bus --->
   pci_bridge_secondary_bus_reset ---> pci_bridge_wait_for_secondary_bus.

   After __pci_reset_slot/__pci_reset_bus calls pci_bridge_wait_for_secondary_bus,
   the device will be restored via pci_dev_restore. 

   However, when a       ======      multifunction PCIe device      =========
   is connected, executing pci_bridge_wait_for_secondary_bus only guarantees the restoration
   of a random device. For other devices that are still restoring, executing pci_dev_restore cannot
   restore the device state normally, resulting in errors or even device offline.

2. In the PCIe dpc_handler process, do pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link)
   to restores the PCIe link. 
   But dpc_reset_link waits for the link to recover via pci_bridge_wait_for_secondary_bus before returning.
   Similarly, pcie_do_recovery restores devices via pci_walk_bridge(bridge, report_resume, &status),
   
   ======      which also requires pci_bridge_wait_for_secondary_bus to wait for all devices to recover completely.
   For other devices that are still restoring, executing report_resume cannot restore the device state normally,
   resulting in errors or even device offline.

3. The PCIe specification only constrains the minimum wait time under different resets. In the historical kernel implementation,
   SBR wait also did not need to restore the PCIe device configuration space before waiting for the coordinate.
   




------------------------------------------------------------------
On Wed, 26 Nov 2025, guanghui.fgh wrote:

> 1. Does this add recursion without restoring config space on each level 
> before starting the child wait?
> 
> Yes
> The current implementation does not require restoring the PCIe device 
> configuration space. The status is determined during the waiting 
> process, either based on software-recorded status or on the device's RO 
> and HWINIT type registers.

What guarantees a link even come up without restoring the config space
first (it may work in your case but is that true universally)?

The commit 3e40aa29d47e ("PCI: Wait for Link before restoring Downstream 
Buses") tried to address this sub-hierarchy wait issue within the 
recursive bus restore (without me ever encountering this problem for 
real). So I don't understand why you had to add the recursion to the wait 
side as well?

-- 
 i.

> > When executing a PCIe secondary bus reset, all downstream switches and
> > endpoints will generate reset events. Simultaneously, all PCIe links
> > will undergo retraining, and each link will independently re-execute the
> > LTSSM state machine training. Therefore, after executing the SBR, it is
> > necessary to wait for all downstream links and devices to complete
> > recovery. Otherwise, after the SBR returns, accessing devices with some
> > links or endpoints not yet fully recovered may result in driver errors,
> > or even trigger device offline issues.
> > 
> > Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> > Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
> > ---
> >  drivers/pci/pci.c | 112 ++++++++++++++++++++++++++++++++++++++--------
> >  1 file changed, 94 insertions(+), 18 deletions(-)
> > 
> > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > index b14dd064006c..9cf13fe69d94 100644
> > --- a/drivers/pci/pci.c
> > +++ b/drivers/pci/pci.c
> > @@ -4788,9 +4788,74 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
> >   return max(min_delay, max_delay);
> >  }
> >  
> > +int __pci_bridge_wait_for_secondary_bus(struct pci_dev *, unsigned long, char *);
> > +
> > +static int pci_dev_wait_child(struct pci_dev *pdev, unsigned long start_t, int timeout,
> > +    char *reset_type)
> > +{
> > + struct pci_dev *child, **dev = NULL;
> > + int ct = 0, i = 0, ret = 0, left = 1;
> > + unsigned long dev_start_t;
> > +
> > + down_read(&pci_bus_sem);
> > +
> > + list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
> > +  ct++;
> > +
> > + if (ct) {
> > +  dev = kzalloc(sizeof(struct pci_dev *) * ct, GFP_KERNEL);
> > +
> > +  if(!dev) {
> > +   pci_err(pdev, "dev mem alloc err\n");
> > +   up_read(&pci_bus_sem);
> > +   return -ENOMEM;
> > +  }
> > +
> > +  list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
> > +   dev[i++] = pci_dev_get(child);
> > + }
> > +
> > + up_read(&pci_bus_sem);
> > +
> > + for (i = 0; i < ct; ++i) {
> > +  left = 1;
> > +
> > +dev_wait:
> > +  dev_start_t = jiffies;
> > +  ret = pci_dev_wait(dev[i], reset_type, left);
> > +  timeout -= jiffies_to_msecs(jiffies - dev_start_t);
> > +
> > +  if (ret) {
> > +   if (pci_dev_is_disconnected(dev[i]))
> > +    continue;
> > +
> > +   if (timeout <= 0)
> > +    goto end;
> > +
> > +   left <<= 1;
> > +   left = timeout > left ? left : timeout;
> > +   goto dev_wait;
> > +  }
> > + }
> > +
> > + for (i = 0; i < ct; ++i) {
> > +  ret = __pci_bridge_wait_for_secondary_bus(dev[i], start_t, reset_type);
> 
> Does this add recursion without restoring config space on each level 
> before starting the child wait?
> 
> > +  if (ret)
> > +   break;
> > + }
> > +
> > +end:
> > + for (i = 0; i < ct; ++i)
> > +  pci_dev_put(dev[i]);
> > +
> > + kfree(dev);
> > + return ret;
> > +}
> > +
> >  /**
> > - * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
> > + * __pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
> >   * @dev: PCI bridge
> > + * @start_t: wait start jiffies time
> >   * @reset_type: reset type in human-readable form
> >   *
> >   * Handle necessary delays before access to the devices on the secondary
> > @@ -4804,10 +4869,9 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
> >   * Return 0 on success or -ENOTTY if the first device on the secondary bus
> >   * failed to become accessible.
> >   */
> > -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> > +int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t, char *reset_type)
> >  {
> > - struct pci_dev *child __free(pci_dev_put) = NULL;
> > - int delay;
> > + int delay, left;
> >  
> >   if (pci_dev_is_disconnected(dev))
> >    return 0;
> > @@ -4835,8 +4899,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> >    return 0;
> >   }
> >  
> > - child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> > -          struct pci_dev, bus_list));
> >   up_read(&pci_bus_sem);
> >  
> >   /*
> > @@ -4844,8 +4906,12 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> >    * accessing the device after reset (that is 1000 ms + 100 ms).
> >    */
> >   if (!pci_is_pcie(dev)) {
> > -  pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
> > -  msleep(1000 + delay);
> > +  left = 1000 + delay - jiffies_to_msecs(jiffies - start_t);
> > +  pci_dbg(dev, "waiting %d ms for secondary bus\n", left > 0 ? left : 0);
> > +
> > +  if (left > 0)
> > +   msleep(left);
> > +
> >    return 0;
> >   }
> >  
> > @@ -4870,10 +4936,14 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> >   if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
> >    u16 status;
> >  
> > -  pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> > -  msleep(delay);
> > +  left = delay - jiffies_to_msecs(jiffies - start_t);
> > +  pci_dbg(dev, "waiting %d ms for downstream link\n", left > 0 ? left : 0);
> > +
> > +  if (left > 0)
> > +   msleep(left);
> >  
> > -  if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> > +  left = PCI_RESET_WAIT - jiffies_to_msecs(jiffies - start_t);
> > +  if(!pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type))
> >     return 0;
> >  
> >    /*
> > @@ -4888,20 +4958,26 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> >    if (!(status & PCI_EXP_LNKSTA_DLLLA))
> >     return -ENOTTY;
> >  
> > -  return pci_dev_wait(child, reset_type,
> > -        PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
> > +  left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
> > +  return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
> >   }
> >  
> > - pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> > -  delay);
> > - if (!pcie_wait_for_link_delay(dev, true, delay)) {
> > + left = delay - jiffies_to_msecs(jiffies - start_t);
> > + pci_dbg(dev, "waiting %d ms for downstream link, after activation\n", left > 0 ? left : 0);
> > +
> > + if (!pcie_wait_for_link_delay(dev, true, left > 0 ? left : 0)) {
> >    /* Did not train, no need to wait any further */
> >    pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
> >    return -ENOTTY;
> >   }
> >  
> > - return pci_dev_wait(child, reset_type,
> > -       PCIE_RESET_READY_POLL_MS - delay);
> > + left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
> > + return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
> > +}
> > +
> > +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> > +{
> > + return __pci_bridge_wait_for_secondary_bus(dev, jiffies, reset_type);
> >  }
> >  
> >  void pci_reset_secondary_bus(struct pci_dev *dev)
> > 
> 
> -- 
>  i.
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-11-26 14:22       ` guanghui.fgh
@ 2025-11-26 14:47         ` Ilpo Järvinen
  2025-11-29 16:36           ` [PATCH v2] " Guanghui Feng
  2025-11-30  5:17           ` [PATCH v3] " Guanghui Feng
  0 siblings, 2 replies; 25+ messages in thread
From: Ilpo Järvinen @ 2025-11-26 14:47 UTC (permalink / raw)
  To: guanghui.fgh; +Cc: bhelgaas, linux-pci, kanie, alikernel-developer

[-- Attachment #1: Type: text/plain, Size: 11084 bytes --]

On Wed, 26 Nov 2025, guanghui.fgh wrote:

> Refer to the previous email discussion replies:
> 
> 1. When a multifunction device is connected to a PCIe slot that supports hotplug, 
>    during the passthrogh device to guest process based on VFIO, some devices
>    need to perform a hot reset to initialize the device:
>    vfio_pci_dev_set_hot_reset ---> __pci_reset_slot/__pci_reset_bus --->
>    pci_bridge_secondary_bus_reset ---> pci_bridge_wait_for_secondary_bus.
>
>    After __pci_reset_slot/__pci_reset_bus calls pci_bridge_wait_for_secondary_bus,
>    the device will be restored via pci_dev_restore. 

...and then they call to pci_slot_restore_locked()/pci_bus_restore_locked() 
that do recursively wait + restore config spaces since the commit 
3e40aa29d47e ("PCI: Wait for Link before restoring Downstream Buses").

>    However, when a       ======      multifunction PCIe device      =========
>    is connected, executing pci_bridge_wait_for_secondary_bus only guarantees the restoration
>    of a random device. For other devices that are still restoring, executing pci_dev_restore cannot
>    restore the device state normally, resulting in errors or even device offline.

Addressing this doesn't require recursion AFAICT.

> 2. In the PCIe dpc_handler process, do pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link)
>    to restores the PCIe link. 
>    But dpc_reset_link waits for the link to recover via pci_bridge_wait_for_secondary_bus before returning.
>    Similarly, pcie_do_recovery restores devices via pci_walk_bridge(bridge, report_resume, &status),
>
>    ======      which also requires pci_bridge_wait_for_secondary_bus to wait for all devices to recover completely.
>    For other devices that are still restoring, executing report_resume cannot restore the device state normally,
>    resulting in errors or even device offline.

This might lack wait for subordinate buses and ordering with restore, but 
I don't think the right place to add that is 
pci_bridge_wait_for_secondary_bus() that is used by others.

> 3. The PCIe specification only constrains the minimum wait time under different resets. In the historical kernel implementation,
>    SBR wait also did not need to restore the PCIe device configuration space before waiting for the coordinate.



I'm not saying there isn't anything to fix/change but I'm not convinced by 
the approach taken by your patch.

--
 i.

> ------------------------------------------------------------------
> On Wed, 26 Nov 2025, guanghui.fgh wrote:
> 
> > 1. Does this add recursion without restoring config space on each level 
> > before starting the child wait?
> > 
> > Yes
> > The current implementation does not require restoring the PCIe device 
> > configuration space. The status is determined during the waiting 
> > process, either based on software-recorded status or on the device's RO 
> > and HWINIT type registers.
> 
> What guarantees a link even come up without restoring the config space
> first (it may work in your case but is that true universally)?
> 
> The commit 3e40aa29d47e ("PCI: Wait for Link before restoring Downstream 
> Buses") tried to address this sub-hierarchy wait issue within the 
> recursive bus restore (without me ever encountering this problem for 
> real). So I don't understand why you had to add the recursion to the wait 
> side as well?
> 
> -- 
>  i.
> 
> > > When executing a PCIe secondary bus reset, all downstream switches and
> > > endpoints will generate reset events. Simultaneously, all PCIe links
> > > will undergo retraining, and each link will independently re-execute the
> > > LTSSM state machine training. Therefore, after executing the SBR, it is
> > > necessary to wait for all downstream links and devices to complete
> > > recovery. Otherwise, after the SBR returns, accessing devices with some
> > > links or endpoints not yet fully recovered may result in driver errors,
> > > or even trigger device offline issues.
> > > 
> > > Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> > > Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
> > > ---
> > >  drivers/pci/pci.c | 112 ++++++++++++++++++++++++++++++++++++++--------
> > >  1 file changed, 94 insertions(+), 18 deletions(-)
> > > 
> > > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > > index b14dd064006c..9cf13fe69d94 100644
> > > --- a/drivers/pci/pci.c
> > > +++ b/drivers/pci/pci.c
> > > @@ -4788,9 +4788,74 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
> > >   return max(min_delay, max_delay);
> > >  }
> > >  
> > > +int __pci_bridge_wait_for_secondary_bus(struct pci_dev *, unsigned long, char *);
> > > +
> > > +static int pci_dev_wait_child(struct pci_dev *pdev, unsigned long start_t, int timeout,
> > > +    char *reset_type)
> > > +{
> > > + struct pci_dev *child, **dev = NULL;
> > > + int ct = 0, i = 0, ret = 0, left = 1;
> > > + unsigned long dev_start_t;
> > > +
> > > + down_read(&pci_bus_sem);
> > > +
> > > + list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
> > > +  ct++;
> > > +
> > > + if (ct) {
> > > +  dev = kzalloc(sizeof(struct pci_dev *) * ct, GFP_KERNEL);
> > > +
> > > +  if(!dev) {
> > > +   pci_err(pdev, "dev mem alloc err\n");
> > > +   up_read(&pci_bus_sem);
> > > +   return -ENOMEM;
> > > +  }
> > > +
> > > +  list_for_each_entry(child, &pdev->subordinate->devices, bus_list)
> > > +   dev[i++] = pci_dev_get(child);
> > > + }
> > > +
> > > + up_read(&pci_bus_sem);
> > > +
> > > + for (i = 0; i < ct; ++i) {
> > > +  left = 1;
> > > +
> > > +dev_wait:
> > > +  dev_start_t = jiffies;
> > > +  ret = pci_dev_wait(dev[i], reset_type, left);
> > > +  timeout -= jiffies_to_msecs(jiffies - dev_start_t);
> > > +
> > > +  if (ret) {
> > > +   if (pci_dev_is_disconnected(dev[i]))
> > > +    continue;
> > > +
> > > +   if (timeout <= 0)
> > > +    goto end;
> > > +
> > > +   left <<= 1;
> > > +   left = timeout > left ? left : timeout;
> > > +   goto dev_wait;
> > > +  }
> > > + }
> > > +
> > > + for (i = 0; i < ct; ++i) {
> > > +  ret = __pci_bridge_wait_for_secondary_bus(dev[i], start_t, reset_type);
> > 
> > Does this add recursion without restoring config space on each level 
> > before starting the child wait?
> > 
> > > +  if (ret)
> > > +   break;
> > > + }
> > > +
> > > +end:
> > > + for (i = 0; i < ct; ++i)
> > > +  pci_dev_put(dev[i]);
> > > +
> > > + kfree(dev);
> > > + return ret;
> > > +}
> > > +
> > >  /**
> > > - * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
> > > + * __pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
> > >   * @dev: PCI bridge
> > > + * @start_t: wait start jiffies time
> > >   * @reset_type: reset type in human-readable form
> > >   *
> > >   * Handle necessary delays before access to the devices on the secondary
> > > @@ -4804,10 +4869,9 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
> > >   * Return 0 on success or -ENOTTY if the first device on the secondary bus
> > >   * failed to become accessible.
> > >   */
> > > -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> > > +int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t, char *reset_type)
> > >  {
> > > - struct pci_dev *child __free(pci_dev_put) = NULL;
> > > - int delay;
> > > + int delay, left;
> > >  
> > >   if (pci_dev_is_disconnected(dev))
> > >    return 0;
> > > @@ -4835,8 +4899,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> > >    return 0;
> > >   }
> > >  
> > > - child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> > > -          struct pci_dev, bus_list));
> > >   up_read(&pci_bus_sem);
> > >  
> > >   /*
> > > @@ -4844,8 +4906,12 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> > >    * accessing the device after reset (that is 1000 ms + 100 ms).
> > >    */
> > >   if (!pci_is_pcie(dev)) {
> > > -  pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
> > > -  msleep(1000 + delay);
> > > +  left = 1000 + delay - jiffies_to_msecs(jiffies - start_t);
> > > +  pci_dbg(dev, "waiting %d ms for secondary bus\n", left > 0 ? left : 0);
> > > +
> > > +  if (left > 0)
> > > +   msleep(left);
> > > +
> > >    return 0;
> > >   }
> > >  
> > > @@ -4870,10 +4936,14 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> > >   if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
> > >    u16 status;
> > >  
> > > -  pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> > > -  msleep(delay);
> > > +  left = delay - jiffies_to_msecs(jiffies - start_t);
> > > +  pci_dbg(dev, "waiting %d ms for downstream link\n", left > 0 ? left : 0);
> > > +
> > > +  if (left > 0)
> > > +   msleep(left);
> > >  
> > > -  if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> > > +  left = PCI_RESET_WAIT - jiffies_to_msecs(jiffies - start_t);
> > > +  if(!pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type))
> > >     return 0;
> > >  
> > >    /*
> > > @@ -4888,20 +4958,26 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> > >    if (!(status & PCI_EXP_LNKSTA_DLLLA))
> > >     return -ENOTTY;
> > >  
> > > -  return pci_dev_wait(child, reset_type,
> > > -        PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
> > > +  left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
> > > +  return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
> > >   }
> > >  
> > > - pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> > > -  delay);
> > > - if (!pcie_wait_for_link_delay(dev, true, delay)) {
> > > + left = delay - jiffies_to_msecs(jiffies - start_t);
> > > + pci_dbg(dev, "waiting %d ms for downstream link, after activation\n", left > 0 ? left : 0);
> > > +
> > > + if (!pcie_wait_for_link_delay(dev, true, left > 0 ? left : 0)) {
> > >    /* Did not train, no need to wait any further */
> > >    pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
> > >    return -ENOTTY;
> > >   }
> > >  
> > > - return pci_dev_wait(child, reset_type,
> > > -       PCIE_RESET_READY_POLL_MS - delay);
> > > + left = PCIE_RESET_READY_POLL_MS - jiffies_to_msecs(jiffies - start_t);
> > > + return pci_dev_wait_child(dev, start_t, left > 0 ? left : 0, reset_type);
> > > +}
> > > +
> > > +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> > > +{
> > > + return __pci_bridge_wait_for_secondary_bus(dev, jiffies, reset_type);
> > >  }
> > >  
> > >  void pci_reset_secondary_bus(struct pci_dev *dev)
> > > 
> > 
> > -- 
> >  i.
> > 
> 

-- 
 i.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v2] PCI: Fix PCIe SBR dev/link wait error
  2025-11-26 14:47         ` Ilpo Järvinen
@ 2025-11-29 16:36           ` Guanghui Feng
  2025-12-01  9:21             ` Ilpo Järvinen
  2025-11-30  5:17           ` [PATCH v3] " Guanghui Feng
  1 sibling, 1 reply; 25+ messages in thread
From: Guanghui Feng @ 2025-11-29 16:36 UTC (permalink / raw)
  To: bhelgaas, ilpo.jarvinen
  Cc: linux-pci, linux-kernel, kanie, alikernel-developer,
	Guanghui Feng

When executing a PCIe secondary bus reset, all downstream switches and
endpoints will generate reset events. Simultaneously, all PCIe links
will undergo retraining, and each link will independently re-execute the
LTSSM state machine training. Therefore, after executing the SBR, it is
necessary to wait for all downstream links and devices to complete
recovery. Otherwise, after the SBR returns, accessing devices with some
links or endpoints not yet fully recovered may result in driver errors,
or even trigger device offline issues.

Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
---
 drivers/pci/pci.c | 138 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 97 insertions(+), 41 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b14dd064006c..76afecb11164 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
 	return max(min_delay, max_delay);
 }
 
+static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
+		unsigned long start_t, char *reset_type)
+{
+	int elapsed = jiffies_to_msecs(jiffies - start_t);
+
+	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
+		return 0;
+
+	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
+		u16 status;
+
+		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
+
+		if (!pci_dev_wait(child, reset_type, 0))
+			return 0;
+
+		if (PCI_RESET_WAIT > elapsed)
+			return PCI_RESET_WAIT - elapsed;
+
+		/*
+		 * If the port supports active link reporting we now check
+		 * whether the link is active and if not bail out early with
+		 * the assumption that the device is not present anymore.
+		 */
+		if (!pdev->link_active_reporting)
+			return -ENOTTY;
+
+		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
+		if (!(status & PCI_EXP_LNKSTA_DLLLA))
+			return -ENOTTY;
+
+		if (!pci_dev_wait(child, reset_type, 0))
+			return 0;
+
+		if (PCIE_RESET_READY_POLL_MS > elapsed)
+			return PCIE_RESET_READY_POLL_MS - elapsed;
+
+		return -ENOTTY;
+	}
+
+	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
+		elapsed);
+	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
+		/* Did not train, no need to wait any further */
+		pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
+		return -ENOTTY;
+	}
+
+	if (!pci_dev_wait(child, reset_type, 0))
+		return 0;
+
+	if (PCIE_RESET_READY_POLL_MS > elapsed)
+		return PCIE_RESET_READY_POLL_MS - elapsed;
+
+	return -ENOTTY;
+}
+
 /**
  * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
  * @dev: PCI bridge
@@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
  * 4.3.2.
  *
  * Return 0 on success or -ENOTTY if the first device on the secondary bus
- * failed to become accessible.
+ * failed to become accessible or a value greater than 0 indicates the
+ * left required waiting time..
  */
-int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
+static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
+		char *reset_type)
 {
-	struct pci_dev *child __free(pci_dev_put) = NULL;
-	int delay;
+	struct pci_dev *child;
+	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
 
 	if (pci_dev_is_disconnected(dev))
 		return 0;
@@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 		return 0;
 	}
 
-	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
-					     struct pci_dev, bus_list));
 	up_read(&pci_bus_sem);
 
 	/*
@@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	 * accessing the device after reset (that is 1000 ms + 100 ms).
 	 */
 	if (!pci_is_pcie(dev)) {
-		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
-		msleep(1000 + delay);
+		if (1000 + delay > elapsed)
+			return 1000 + delay - elapsed;
+
+		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
 		return 0;
 	}
 
@@ -4867,41 +4926,40 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	if (!pcie_downstream_port(dev))
 		return 0;
 
-	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
-		u16 status;
-
-		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
-		msleep(delay);
-
-		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
-			return 0;
+	if (delay > elapsed)
+		return delay - elapsed;
 
+	down_read(&pci_bus_sem);
+	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
 		/*
-		 * If the port supports active link reporting we now check
-		 * whether the link is active and if not bail out early with
-		 * the assumption that the device is not present anymore.
+		 * Check if all devices under the same bus have completed
+		 * the reset process, including multifunction devices in
+		 * the same bus.
 		 */
-		if (!dev->link_active_reporting)
-			return -ENOTTY;
+		ret = pci_readiness_check(dev, child, start_t, reset_type);
 
-		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
-		if (!(status & PCI_EXP_LNKSTA_DLLLA))
-			return -ENOTTY;
+		if (ret == 0 && child->subordinate)
+			ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);
 
-		return pci_dev_wait(child, reset_type,
-				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
+		if(ret)
+			break;
 	}
+	up_read(&pci_bus_sem);
 
-	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
-		delay);
-	if (!pcie_wait_for_link_delay(dev, true, delay)) {
-		/* Did not train, no need to wait any further */
-		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
-		return -ENOTTY;
-	}
+	return ret;
+}
 
-	return pci_dev_wait(child, reset_type,
-			    PCIE_RESET_READY_POLL_MS - delay);
+int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
+{
+	int res = 0;
+	unsigned long start_t = jiffies;
+
+	do {
+		msleep(res);
+		res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
+	} while (res > 0);
+
+	return res;
 }
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
@@ -5542,10 +5600,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		pci_dev_restore(dev);
-		if (dev->subordinate) {
-			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
+		if (dev->subordinate)
 			pci_bus_restore_locked(dev->subordinate);
-		}
 	}
 }
 
@@ -5575,14 +5631,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
 {
 	struct pci_dev *dev;
 
+	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
+
 	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
 		if (!dev->slot || dev->slot != slot)
 			continue;
 		pci_dev_restore(dev);
-		if (dev->subordinate) {
-			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
+		if (dev->subordinate)
 			pci_bus_restore_locked(dev->subordinate);
-		}
 	}
 }
 
-- 
2.32.0.3.gf3a3e56d6


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v3] PCI: Fix PCIe SBR dev/link wait error
  2025-11-26 14:47         ` Ilpo Järvinen
  2025-11-29 16:36           ` [PATCH v2] " Guanghui Feng
@ 2025-11-30  5:17           ` Guanghui Feng
  2025-12-01  9:24             ` Ilpo Järvinen
  1 sibling, 1 reply; 25+ messages in thread
From: Guanghui Feng @ 2025-11-30  5:17 UTC (permalink / raw)
  To: bhelgaas, ilpo.jarvinen; +Cc: linux-pci, alikernel-developer

When executing a PCIe secondary bus reset, all downstream switches and
endpoints will generate reset events. Simultaneously, all PCIe links
will undergo retraining, and each link will independently re-execute the
LTSSM state machine training. Therefore, after executing the SBR, it is
necessary to wait for all downstream links and devices to complete
recovery. Otherwise, after the SBR returns, accessing devices with some
links or endpoints not yet fully recovered may result in driver errors,
or even trigger device offline issues.

Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
---
 drivers/pci/pci.c | 141 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 101 insertions(+), 40 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b14dd064006c..d91c65145739 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
 	return max(min_delay, max_delay);
 }
 
+static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
+		unsigned long start_t, char *reset_type)
+{
+	int elapsed = jiffies_to_msecs(jiffies - start_t);
+
+	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
+		return 0;
+
+	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
+		u16 status;
+
+		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
+
+		if (!pci_dev_wait(child, reset_type, 0))
+			return 0;
+
+		if (PCI_RESET_WAIT > elapsed)
+			return PCI_RESET_WAIT - elapsed;
+
+		/*
+		 * If the port supports active link reporting we now check
+		 * whether the link is active and if not bail out early with
+		 * the assumption that the device is not present anymore.
+		 */
+		if (!pdev->link_active_reporting)
+			return -ENOTTY;
+
+		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
+		if (!(status & PCI_EXP_LNKSTA_DLLLA))
+			return -ENOTTY;
+
+		if (!pci_dev_wait(child, reset_type, 0))
+			return 0;
+
+		if (PCIE_RESET_READY_POLL_MS > elapsed)
+			return PCIE_RESET_READY_POLL_MS - elapsed;
+
+		return -ENOTTY;
+	}
+
+	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
+		elapsed);
+	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
+		/* Did not train, no need to wait any further */
+		pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
+		return -ENOTTY;
+	}
+
+	if (!pci_dev_wait(child, reset_type, 0))
+		return 0;
+
+	if (PCIE_RESET_READY_POLL_MS > elapsed)
+		return PCIE_RESET_READY_POLL_MS - elapsed;
+
+	return -ENOTTY;
+}
+
 /**
  * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
  * @dev: PCI bridge
@@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
  * 4.3.2.
  *
  * Return 0 on success or -ENOTTY if the first device on the secondary bus
- * failed to become accessible.
+ * failed to become accessible or a value greater than 0 indicates the
+ * left required waiting time..
  */
-int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
+static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
+		char *reset_type)
 {
-	struct pci_dev *child __free(pci_dev_put) = NULL;
-	int delay;
+	struct pci_dev *child;
+	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
 
 	if (pci_dev_is_disconnected(dev))
 		return 0;
@@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 		return 0;
 	}
 
-	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
-					     struct pci_dev, bus_list));
 	up_read(&pci_bus_sem);
 
 	/*
@@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	 * accessing the device after reset (that is 1000 ms + 100 ms).
 	 */
 	if (!pci_is_pcie(dev)) {
-		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
-		msleep(1000 + delay);
+		if (1000 + delay > elapsed)
+			return 1000 + delay - elapsed;
+
+		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
 		return 0;
 	}
 
@@ -4867,41 +4926,45 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	if (!pcie_downstream_port(dev))
 		return 0;
 
-	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
-		u16 status;
-
-		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
-		msleep(delay);
-
-		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
-			return 0;
+	if (delay > elapsed)
+		return delay - elapsed;
 
+	down_read(&pci_bus_sem);
+	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
 		/*
-		 * If the port supports active link reporting we now check
-		 * whether the link is active and if not bail out early with
-		 * the assumption that the device is not present anymore.
+		 * Check if all devices under the same bus have completed
+		 * the reset process, including multifunction devices in
+		 * the same bus.
 		 */
-		if (!dev->link_active_reporting)
-			return -ENOTTY;
+		ret = pci_readiness_check(dev, child, start_t, reset_type);
 
-		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
-		if (!(status & PCI_EXP_LNKSTA_DLLLA))
-			return -ENOTTY;
+		if (ret == 0 && child->subordinate)
+			ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);
 
-		return pci_dev_wait(child, reset_type,
-				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
+		if(ret)
+			break;
 	}
+	up_read(&pci_bus_sem);
 
-	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
-		delay);
-	if (!pcie_wait_for_link_delay(dev, true, delay)) {
-		/* Did not train, no need to wait any further */
-		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
-		return -ENOTTY;
+	return ret;
+}
+
+int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
+{
+	int res, gap = 1;
+	unsigned long start_t = jiffies;
+
+	res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
+
+	while (res > 0) {
+		gap = gap < res ? gap : res;
+		msleep(gap);
+		gap <<= 1;
+
+		res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
 	}
 
-	return pci_dev_wait(child, reset_type,
-			    PCIE_RESET_READY_POLL_MS - delay);
+	return res;
 }
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
@@ -5542,10 +5605,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		pci_dev_restore(dev);
-		if (dev->subordinate) {
-			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
+		if (dev->subordinate)
 			pci_bus_restore_locked(dev->subordinate);
-		}
 	}
 }
 
@@ -5575,14 +5636,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
 {
 	struct pci_dev *dev;
 
+	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
+
 	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
 		if (!dev->slot || dev->slot != slot)
 			continue;
 		pci_dev_restore(dev);
-		if (dev->subordinate) {
-			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
+		if (dev->subordinate)
 			pci_bus_restore_locked(dev->subordinate);
-		}
 	}
 }
 
-- 
2.32.0.3.gf3a3e56d6


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] PCI: Fix PCIe SBR dev/link wait error
  2025-11-29 16:36           ` [PATCH v2] " Guanghui Feng
@ 2025-12-01  9:21             ` Ilpo Järvinen
  2025-12-01 12:21               ` guanghuifeng
  0 siblings, 1 reply; 25+ messages in thread
From: Ilpo Järvinen @ 2025-12-01  9:21 UTC (permalink / raw)
  To: Guanghui Feng; +Cc: bhelgaas, linux-pci, LKML, kanie, alikernel-developer

On Sun, 30 Nov 2025, Guanghui Feng wrote:

> When executing a PCIe secondary bus reset, all downstream switches and
> endpoints will generate reset events. Simultaneously, all PCIe links
> will undergo retraining, and each link will independently re-execute the
> LTSSM state machine training. Therefore, after executing the SBR, it is
> necessary to wait for all downstream links and devices to complete
> recovery. Otherwise, after the SBR returns, accessing devices with some
> links or endpoints not yet fully recovered may result in driver errors,
> or even trigger device offline issues.
> 
> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
> ---

Hi,

In future, when posting an update, please always explain here 
below the --- line what was changed between the versions.

>  drivers/pci/pci.c | 138 ++++++++++++++++++++++++++++++++--------------
>  1 file changed, 97 insertions(+), 41 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index b14dd064006c..76afecb11164 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>  	return max(min_delay, max_delay);
>  }
>  
> +static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
> +		unsigned long start_t, char *reset_type)
> +{
> +	int elapsed = jiffies_to_msecs(jiffies - start_t);
> +
> +	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
> +		return 0;
> +
> +	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
> +		u16 status;
> +
> +		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
> +
> +		if (!pci_dev_wait(child, reset_type, 0))
> +			return 0;
> +
> +		if (PCI_RESET_WAIT > elapsed)
> +			return PCI_RESET_WAIT - elapsed;
> +
> +		/*
> +		 * If the port supports active link reporting we now check
> +		 * whether the link is active and if not bail out early with
> +		 * the assumption that the device is not present anymore.
> +		 */
> +		if (!pdev->link_active_reporting)
> +			return -ENOTTY;
> +
> +		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
> +		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> +			return -ENOTTY;
> +
> +		if (!pci_dev_wait(child, reset_type, 0))
> +			return 0;
> +
> +		if (PCIE_RESET_READY_POLL_MS > elapsed)
> +			return PCIE_RESET_READY_POLL_MS - elapsed;
> +
> +		return -ENOTTY;
> +	}
> +
> +	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
> +		elapsed);
> +	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
> +		/* Did not train, no need to wait any further */
> +		pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
> +		return -ENOTTY;
> +	}
> +
> +	if (!pci_dev_wait(child, reset_type, 0))
> +		return 0;
> +
> +	if (PCIE_RESET_READY_POLL_MS > elapsed)
> +		return PCIE_RESET_READY_POLL_MS - elapsed;
> +
> +	return -ENOTTY;
> +}
> +
>  /**
>   * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
>   * @dev: PCI bridge
> @@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>   * 4.3.2.
>   *
>   * Return 0 on success or -ENOTTY if the first device on the secondary bus
> - * failed to become accessible.
> + * failed to become accessible or a value greater than 0 indicates the
> + * left required waiting time..
>   */
> -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
> +		char *reset_type)
>  {
> -	struct pci_dev *child __free(pci_dev_put) = NULL;
> -	int delay;
> +	struct pci_dev *child;
> +	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
>  
>  	if (pci_dev_is_disconnected(dev))
>  		return 0;
> @@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  		return 0;
>  	}
>  
> -	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> -					     struct pci_dev, bus_list));
>  	up_read(&pci_bus_sem);
>  
>  	/*
> @@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	 * accessing the device after reset (that is 1000 ms + 100 ms).
>  	 */
>  	if (!pci_is_pcie(dev)) {
> -		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
> -		msleep(1000 + delay);
> +		if (1000 + delay > elapsed)
> +			return 1000 + delay - elapsed;
> +
> +		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
>  		return 0;
>  	}
>  
> @@ -4867,41 +4926,40 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	if (!pcie_downstream_port(dev))
>  		return 0;
>  
> -	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
> -		u16 status;
> -
> -		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> -		msleep(delay);
> -
> -		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> -			return 0;
> +	if (delay > elapsed)
> +		return delay - elapsed;
>  
> +	down_read(&pci_bus_sem);
> +	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
>  		/*
> -		 * If the port supports active link reporting we now check
> -		 * whether the link is active and if not bail out early with
> -		 * the assumption that the device is not present anymore.
> +		 * Check if all devices under the same bus have completed
> +		 * the reset process, including multifunction devices in
> +		 * the same bus.
>  		 */
> -		if (!dev->link_active_reporting)
> -			return -ENOTTY;
> +		ret = pci_readiness_check(dev, child, start_t, reset_type);
>  
> -		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
> -		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> -			return -ENOTTY;
> +		if (ret == 0 && child->subordinate)
> +			ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);
>  
> -		return pci_dev_wait(child, reset_type,
> -				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
> +		if(ret)
> +			break;
>  	}
> +	up_read(&pci_bus_sem);
>  
> -	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> -		delay);
> -	if (!pcie_wait_for_link_delay(dev, true, delay)) {
> -		/* Did not train, no need to wait any further */
> -		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
> -		return -ENOTTY;
> -	}
> +	return ret;
> +}
>  
> -	return pci_dev_wait(child, reset_type,
> -			    PCIE_RESET_READY_POLL_MS - delay);
> +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +{
> +	int res = 0;
> +	unsigned long start_t = jiffies;
> +
> +	do {
> +		msleep(res);
> +		res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
> +	} while (res > 0);
> +
> +	return res;
>  }
>  
>  void pci_reset_secondary_bus(struct pci_dev *dev)
> @@ -5542,10 +5600,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		pci_dev_restore(dev);
> -		if (dev->subordinate) {
> -			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
> +		if (dev->subordinate)
>  			pci_bus_restore_locked(dev->subordinate);
> -		}
>  	}
>  }

???

Unfortunately, this takes a wrong turn and is very much against the 
feedback I gave to you.

>  
> @@ -5575,14 +5631,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
>  {
>  	struct pci_dev *dev;
>  
> +	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
> +
>  	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
>  		if (!dev->slot || dev->slot != slot)
>  			continue;
>  		pci_dev_restore(dev);
> -		if (dev->subordinate) {
> -			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
> +		if (dev->subordinate)
>  			pci_bus_restore_locked(dev->subordinate);
> -		}


-- 
 i.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] PCI: Fix PCIe SBR dev/link wait error
  2025-11-30  5:17           ` [PATCH v3] " Guanghui Feng
@ 2025-12-01  9:24             ` Ilpo Järvinen
  2025-12-01 12:31               ` guanghuifeng
  0 siblings, 1 reply; 25+ messages in thread
From: Ilpo Järvinen @ 2025-12-01  9:24 UTC (permalink / raw)
  To: Guanghui Feng; +Cc: bhelgaas, linux-pci, alikernel-developer

On Sun, 30 Nov 2025, Guanghui Feng wrote:

> When executing a PCIe secondary bus reset, all downstream switches and
> endpoints will generate reset events. Simultaneously, all PCIe links
> will undergo retraining, and each link will independently re-execute the
> LTSSM state machine training. Therefore, after executing the SBR, it is
> necessary to wait for all downstream links and devices to complete
> recovery. Otherwise, after the SBR returns, accessing devices with some
> links or endpoints not yet fully recovered may result in driver errors,
> or even trigger device offline issues.
> 
> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
> ---

The review comment I gave to v2 still stands to this v3 as well.

Please don't send new versions too rapidly but give people time to comment 
on the previous version. And please remember to explain what you changed 
in each version (below the -- line).

-- 
 i.

>  drivers/pci/pci.c | 141 +++++++++++++++++++++++++++++++++-------------
>  1 file changed, 101 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index b14dd064006c..d91c65145739 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>  	return max(min_delay, max_delay);
>  }
>  
> +static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
> +		unsigned long start_t, char *reset_type)
> +{
> +	int elapsed = jiffies_to_msecs(jiffies - start_t);
> +
> +	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
> +		return 0;
> +
> +	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
> +		u16 status;
> +
> +		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
> +
> +		if (!pci_dev_wait(child, reset_type, 0))
> +			return 0;
> +
> +		if (PCI_RESET_WAIT > elapsed)
> +			return PCI_RESET_WAIT - elapsed;
> +
> +		/*
> +		 * If the port supports active link reporting we now check
> +		 * whether the link is active and if not bail out early with
> +		 * the assumption that the device is not present anymore.
> +		 */
> +		if (!pdev->link_active_reporting)
> +			return -ENOTTY;
> +
> +		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
> +		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> +			return -ENOTTY;
> +
> +		if (!pci_dev_wait(child, reset_type, 0))
> +			return 0;
> +
> +		if (PCIE_RESET_READY_POLL_MS > elapsed)
> +			return PCIE_RESET_READY_POLL_MS - elapsed;
> +
> +		return -ENOTTY;
> +	}
> +
> +	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
> +		elapsed);
> +	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
> +		/* Did not train, no need to wait any further */
> +		pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
> +		return -ENOTTY;
> +	}
> +
> +	if (!pci_dev_wait(child, reset_type, 0))
> +		return 0;
> +
> +	if (PCIE_RESET_READY_POLL_MS > elapsed)
> +		return PCIE_RESET_READY_POLL_MS - elapsed;
> +
> +	return -ENOTTY;
> +}
> +
>  /**
>   * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
>   * @dev: PCI bridge
> @@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>   * 4.3.2.
>   *
>   * Return 0 on success or -ENOTTY if the first device on the secondary bus
> - * failed to become accessible.
> + * failed to become accessible or a value greater than 0 indicates the
> + * left required waiting time..
>   */
> -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
> +		char *reset_type)
>  {
> -	struct pci_dev *child __free(pci_dev_put) = NULL;
> -	int delay;
> +	struct pci_dev *child;
> +	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
>  
>  	if (pci_dev_is_disconnected(dev))
>  		return 0;
> @@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  		return 0;
>  	}
>  
> -	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> -					     struct pci_dev, bus_list));
>  	up_read(&pci_bus_sem);
>  
>  	/*
> @@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	 * accessing the device after reset (that is 1000 ms + 100 ms).
>  	 */
>  	if (!pci_is_pcie(dev)) {
> -		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
> -		msleep(1000 + delay);
> +		if (1000 + delay > elapsed)
> +			return 1000 + delay - elapsed;
> +
> +		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
>  		return 0;
>  	}
>  
> @@ -4867,41 +4926,45 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	if (!pcie_downstream_port(dev))
>  		return 0;
>  
> -	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
> -		u16 status;
> -
> -		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> -		msleep(delay);
> -
> -		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> -			return 0;
> +	if (delay > elapsed)
> +		return delay - elapsed;
>  
> +	down_read(&pci_bus_sem);
> +	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
>  		/*
> -		 * If the port supports active link reporting we now check
> -		 * whether the link is active and if not bail out early with
> -		 * the assumption that the device is not present anymore.
> +		 * Check if all devices under the same bus have completed
> +		 * the reset process, including multifunction devices in
> +		 * the same bus.
>  		 */
> -		if (!dev->link_active_reporting)
> -			return -ENOTTY;
> +		ret = pci_readiness_check(dev, child, start_t, reset_type);
>  
> -		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
> -		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> -			return -ENOTTY;
> +		if (ret == 0 && child->subordinate)
> +			ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);
>  
> -		return pci_dev_wait(child, reset_type,
> -				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
> +		if(ret)
> +			break;
>  	}
> +	up_read(&pci_bus_sem);
>  
> -	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> -		delay);
> -	if (!pcie_wait_for_link_delay(dev, true, delay)) {
> -		/* Did not train, no need to wait any further */
> -		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
> -		return -ENOTTY;
> +	return ret;
> +}
> +
> +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +{
> +	int res, gap = 1;
> +	unsigned long start_t = jiffies;
> +
> +	res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
> +
> +	while (res > 0) {
> +		gap = gap < res ? gap : res;
> +		msleep(gap);
> +		gap <<= 1;
> +
> +		res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
>  	}
>  
> -	return pci_dev_wait(child, reset_type,
> -			    PCIE_RESET_READY_POLL_MS - delay);
> +	return res;
>  }
>  
>  void pci_reset_secondary_bus(struct pci_dev *dev)
> @@ -5542,10 +5605,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		pci_dev_restore(dev);
> -		if (dev->subordinate) {
> -			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
> +		if (dev->subordinate)
>  			pci_bus_restore_locked(dev->subordinate);
> -		}
>  	}
>  }
>  
> @@ -5575,14 +5636,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
>  {
>  	struct pci_dev *dev;
>  
> +	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
> +
>  	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
>  		if (!dev->slot || dev->slot != slot)
>  			continue;
>  		pci_dev_restore(dev);
> -		if (dev->subordinate) {
> -			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
> +		if (dev->subordinate)
>  			pci_bus_restore_locked(dev->subordinate);
> -		}
>  	}
>  }
>  
> 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-11-25  6:20   ` guanghui.fgh
@ 2025-12-01 10:03     ` Lukas Wunner
  2025-12-01 12:56       ` guanghuifeng
  0 siblings, 1 reply; 25+ messages in thread
From: Lukas Wunner @ 2025-12-01 10:03 UTC (permalink / raw)
  To: guanghui.fgh
  Cc: Bjorn Helgaas, linux-pci, kanie, alikernel-developer,
	Ilpo Järvinen

On Tue, Nov 25, 2025 at 02:20:10PM +0800, guanghui.fgh wrote:
> After __pci_reset_slot/__pci_reset_bus calls
> pci_bridge_wait_for_secondary_bus, the device will be restored via
> pci_dev_restore. However, when a multifunction PCIe device is connected,
> executing pci_bridge_wait_for_secondary_bus only guarantees the restoration
> of a random device. For other devices that are still restoring, executing
> pci_dev_restore cannot restore the device state normally, resulting in
> errors or even device offline.

PCIe is point-to-point, i.e. at the two ends of a link there's only a
single physical device.  So if there are multiple pci_dev's on a bus,
they're additional functions or VFs of the same physical device.

The expectation is that if the first device on the bus is accessible,
all other functions of the same physical device are accessible as well.
That's why we only wait for the first device to become accessible.

It seems highly unusual that the different functions of the same physical
device require different delays until they're accessible.  I don't think
we can accept such a sweeping change wholesale without more details,
so please share what the topology looks like (lspci -tv), what devices are
involved (lspci -vvv) and which device requires extra wait time for some
of its functions.

Thanks,

Lukas

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] PCI: Fix PCIe SBR dev/link wait error
  2025-12-01  9:21             ` Ilpo Järvinen
@ 2025-12-01 12:21               ` guanghuifeng
  2025-12-01 13:08                 ` Ilpo Järvinen
  0 siblings, 1 reply; 25+ messages in thread
From: guanghuifeng @ 2025-12-01 12:21 UTC (permalink / raw)
  To: Ilpo Järvinen; +Cc: bhelgaas, linux-pci, LKML, kanie, alikernel-developer


在 2025/12/1 17:21, Ilpo Järvinen 写道:
> On Sun, 30 Nov 2025, Guanghui Feng wrote:
>
>> When executing a PCIe secondary bus reset, all downstream switches and
>> endpoints will generate reset events. Simultaneously, all PCIe links
>> will undergo retraining, and each link will independently re-execute the
>> LTSSM state machine training. Therefore, after executing the SBR, it is
>> necessary to wait for all downstream links and devices to complete
>> recovery. Otherwise, after the SBR returns, accessing devices with some
>> links or endpoints not yet fully recovered may result in driver errors,
>> or even trigger device offline issues.
>>
>> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
>> Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
>> ---
> Hi,
>
> In future, when posting an update, please always explain here
> below the --- line what was changed between the versions.
OK
>>   drivers/pci/pci.c | 138 ++++++++++++++++++++++++++++++++--------------
>>   1 file changed, 97 insertions(+), 41 deletions(-)
>>
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index b14dd064006c..76afecb11164 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>>   	return max(min_delay, max_delay);
>>   }
>>   
>> +static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
>> +		unsigned long start_t, char *reset_type)
>> +{
>> +	int elapsed = jiffies_to_msecs(jiffies - start_t);
>> +
>> +	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
>> +		return 0;
>> +
>> +	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
>> +		u16 status;
>> +
>> +		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
>> +
>> +		if (!pci_dev_wait(child, reset_type, 0))
>> +			return 0;
>> +
>> +		if (PCI_RESET_WAIT > elapsed)
>> +			return PCI_RESET_WAIT - elapsed;
>> +
>> +		/*
>> +		 * If the port supports active link reporting we now check
>> +		 * whether the link is active and if not bail out early with
>> +		 * the assumption that the device is not present anymore.
>> +		 */
>> +		if (!pdev->link_active_reporting)
>> +			return -ENOTTY;
>> +
>> +		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
>> +		if (!(status & PCI_EXP_LNKSTA_DLLLA))
>> +			return -ENOTTY;
>> +
>> +		if (!pci_dev_wait(child, reset_type, 0))
>> +			return 0;
>> +
>> +		if (PCIE_RESET_READY_POLL_MS > elapsed)
>> +			return PCIE_RESET_READY_POLL_MS - elapsed;
>> +
>> +		return -ENOTTY;
>> +	}
>> +
>> +	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
>> +		elapsed);
>> +	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
>> +		/* Did not train, no need to wait any further */
>> +		pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
>> +		return -ENOTTY;
>> +	}
>> +
>> +	if (!pci_dev_wait(child, reset_type, 0))
>> +		return 0;
>> +
>> +	if (PCIE_RESET_READY_POLL_MS > elapsed)
>> +		return PCIE_RESET_READY_POLL_MS - elapsed;
>> +
>> +	return -ENOTTY;
>> +}
>> +
>>   /**
>>    * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
>>    * @dev: PCI bridge
>> @@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>>    * 4.3.2.
>>    *
>>    * Return 0 on success or -ENOTTY if the first device on the secondary bus
>> - * failed to become accessible.
>> + * failed to become accessible or a value greater than 0 indicates the
>> + * left required waiting time..
>>    */
>> -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>> +static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
>> +		char *reset_type)
>>   {
>> -	struct pci_dev *child __free(pci_dev_put) = NULL;
>> -	int delay;
>> +	struct pci_dev *child;
>> +	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
>>   
>>   	if (pci_dev_is_disconnected(dev))
>>   		return 0;
>> @@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>>   		return 0;
>>   	}
>>   
>> -	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
>> -					     struct pci_dev, bus_list));
>>   	up_read(&pci_bus_sem);
>>   
>>   	/*
>> @@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>>   	 * accessing the device after reset (that is 1000 ms + 100 ms).
>>   	 */
>>   	if (!pci_is_pcie(dev)) {
>> -		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
>> -		msleep(1000 + delay);
>> +		if (1000 + delay > elapsed)
>> +			return 1000 + delay - elapsed;
>> +
>> +		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
>>   		return 0;
>>   	}
>>   
>> @@ -4867,41 +4926,40 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>>   	if (!pcie_downstream_port(dev))
>>   		return 0;
>>   
>> -	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
>> -		u16 status;
>> -
>> -		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
>> -		msleep(delay);
>> -
>> -		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
>> -			return 0;
>> +	if (delay > elapsed)
>> +		return delay - elapsed;
>>   
>> +	down_read(&pci_bus_sem);
>> +	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
>>   		/*
>> -		 * If the port supports active link reporting we now check
>> -		 * whether the link is active and if not bail out early with
>> -		 * the assumption that the device is not present anymore.
>> +		 * Check if all devices under the same bus have completed
>> +		 * the reset process, including multifunction devices in
>> +		 * the same bus.
>>   		 */
>> -		if (!dev->link_active_reporting)
>> -			return -ENOTTY;
>> +		ret = pci_readiness_check(dev, child, start_t, reset_type);
>>   
>> -		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
>> -		if (!(status & PCI_EXP_LNKSTA_DLLLA))
>> -			return -ENOTTY;
>> +		if (ret == 0 && child->subordinate)
>> +			ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);
>>   
>> -		return pci_dev_wait(child, reset_type,
>> -				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
>> +		if(ret)
>> +			break;
>>   	}
>> +	up_read(&pci_bus_sem);
>>   
>> -	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
>> -		delay);
>> -	if (!pcie_wait_for_link_delay(dev, true, delay)) {
>> -		/* Did not train, no need to wait any further */
>> -		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
>> -		return -ENOTTY;
>> -	}
>> +	return ret;
>> +}
>>   
>> -	return pci_dev_wait(child, reset_type,
>> -			    PCIE_RESET_READY_POLL_MS - delay);
>> +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>> +{
>> +	int res = 0;
>> +	unsigned long start_t = jiffies;
>> +
>> +	do {
>> +		msleep(res);
>> +		res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
>> +	} while (res > 0);
>> +
>> +	return res;
>>   }
>>   
>>   void pci_reset_secondary_bus(struct pci_dev *dev)
>> @@ -5542,10 +5600,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
>>   
>>   	list_for_each_entry(dev, &bus->devices, bus_list) {
>>   		pci_dev_restore(dev);
>> -		if (dev->subordinate) {
>> -			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
>> +		if (dev->subordinate)
>>   			pci_bus_restore_locked(dev->subordinate);
>> -		}
>>   	}
>>   }
> ???
>
> Unfortunately, this takes a wrong turn and is very much against the
> feedback I gave to you.

1. The implementation of 
pci_bridge_secondary_bus_reset/pci_bridge_wait_for_secondary_bus

    has been modified to wait for all downstream switches/link/devices 
to complete their recovery

    before returning.

2. Therefore, when pci_bus_restore_locked is called via __pci_reset_bus, 
the waiting process has

    already been completed via 
pci_bridge_secondary_bus_reset/pci_bridge_wait_for_secondary_bus,

    so pci_bus_restore_locked does not require additional waiting.

>>   
>> @@ -5575,14 +5631,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
>>   {
>>   	struct pci_dev *dev;
>>   
>> +	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
>> +
>>   	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
>>   		if (!dev->slot || dev->slot != slot)
>>   			continue;
>>   		pci_dev_restore(dev);
>> -		if (dev->subordinate) {
>> -			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
>> +		if (dev->subordinate)
>>   			pci_bus_restore_locked(dev->subordinate);
>> -		}

1. When pci_slot_restore_locked is called for a PCIe slot reset, 
pci_bridge_wait_for_secondary_bus

    has already been executed to wait for all switches, links, and 
devices to complete their recovery firstly

>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] PCI: Fix PCIe SBR dev/link wait error
  2025-12-01  9:24             ` Ilpo Järvinen
@ 2025-12-01 12:31               ` guanghuifeng
  0 siblings, 0 replies; 25+ messages in thread
From: guanghuifeng @ 2025-12-01 12:31 UTC (permalink / raw)
  To: Ilpo Järvinen; +Cc: bhelgaas, linux-pci, alikernel-developer


在 2025/12/1 17:24, Ilpo Järvinen 写道:
> On Sun, 30 Nov 2025, Guanghui Feng wrote:
>
>> When executing a PCIe secondary bus reset, all downstream switches and
>> endpoints will generate reset events. Simultaneously, all PCIe links
>> will undergo retraining, and each link will independently re-execute the
>> LTSSM state machine training. Therefore, after executing the SBR, it is
>> necessary to wait for all downstream links and devices to complete
>> recovery. Otherwise, after the SBR returns, accessing devices with some
>> links or endpoints not yet fully recovered may result in driver errors,
>> or even trigger device offline issues.
>>
>> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
>> Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
>> ---
> The review comment I gave to v2 still stands to this v3 as well.
>
> Please don't send new versions too rapidly but give people time to comment
> on the previous version. And please remember to explain what you changed
> in each version (below the -- line).

Thanks.

Compared to v2, patch v3 only modifies the implementation of

pci_bridge_wait_for_secondary_bus.

Patch v2 directly used a longer msleep timer, while v3 modifies

it to use a sequentially amplified msleep timer, allowing for more

timely detection of switch, link, and device recovery completion,

thus reducing the overall SBR waiting time.



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-12-01 10:03     ` Lukas Wunner
@ 2025-12-01 12:56       ` guanghuifeng
  2025-12-01 13:26         ` Lukas Wunner
  0 siblings, 1 reply; 25+ messages in thread
From: guanghuifeng @ 2025-12-01 12:56 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: Bjorn Helgaas, linux-pci, kanie, alikernel-developer,
	Ilpo Järvinen


在 2025/12/1 18:03, Lukas Wunner 写道:
> On Tue, Nov 25, 2025 at 02:20:10PM +0800, guanghui.fgh wrote:
>> After __pci_reset_slot/__pci_reset_bus calls
>> pci_bridge_wait_for_secondary_bus, the device will be restored via
>> pci_dev_restore. However, when a multifunction PCIe device is connected,
>> executing pci_bridge_wait_for_secondary_bus only guarantees the restoration
>> of a random device. For other devices that are still restoring, executing
>> pci_dev_restore cannot restore the device state normally, resulting in
>> errors or even device offline.
> PCIe is point-to-point, i.e. at the two ends of a link there's only a
> single physical device.  So if there are multiple pci_dev's on a bus,
> they're additional functions or VFs of the same physical device.
>
> The expectation is that if the first device on the bus is accessible,
> all other functions of the same physical device are accessible as well.
> That's why we only wait for the first device to become accessible.
>
> It seems highly unusual that the different functions of the same physical
> device require different delays until they're accessible.  I don't think
> we can accept such a sweeping change wholesale without more details,
> so please share what the topology looks like (lspci -tv), what devices are
> involved (lspci -vvv) and which device requires extra wait time for some
> of its functions.
>
> Thanks,
>
> Lukas

1. For PCIe end-to-end/point-to-point connections, PCIe multifunctions 
do indeed share the same PCIe links/lanes.
2. However, the functions within a PCIe multifunction device have 
different functionalities and complexities.

    During the hot reset process, each function requires a different 
recovery time. Therefore, after confirming

    that the PCIe links are functioning correctly, it is necessary to 
further check to ensure that each function has completed its recovery.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] PCI: Fix PCIe SBR dev/link wait error
  2025-12-01 12:21               ` guanghuifeng
@ 2025-12-01 13:08                 ` Ilpo Järvinen
  2025-12-02  4:32                   ` [PATCH v4 0/1] " Guanghui Feng
  0 siblings, 1 reply; 25+ messages in thread
From: Ilpo Järvinen @ 2025-12-01 13:08 UTC (permalink / raw)
  To: guanghuifeng@linux.alibaba.com
  Cc: bhelgaas, linux-pci, LKML, kanie, alikernel-developer

[-- Attachment #1: Type: text/plain, Size: 10747 bytes --]

On Mon, 1 Dec 2025, guanghuifeng@linux.alibaba.com wrote:
> 在 2025/12/1 17:21, Ilpo Järvinen 写道:
> > On Sun, 30 Nov 2025, Guanghui Feng wrote:
> > 
> > > When executing a PCIe secondary bus reset, all downstream switches and
> > > endpoints will generate reset events. Simultaneously, all PCIe links
> > > will undergo retraining, and each link will independently re-execute the
> > > LTSSM state machine training. Therefore, after executing the SBR, it is
> > > necessary to wait for all downstream links and devices to complete
> > > recovery. Otherwise, after the SBR returns, accessing devices with some
> > > links or endpoints not yet fully recovered may result in driver errors,
> > > or even trigger device offline issues.
> > > 
> > > Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> > > Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
> > > ---
> > Hi,
> > 
> > In future, when posting an update, please always explain here
> > below the --- line what was changed between the versions.
> OK
> > >   drivers/pci/pci.c | 138 ++++++++++++++++++++++++++++++++--------------
> > >   1 file changed, 97 insertions(+), 41 deletions(-)
> > > 
> > > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > > index b14dd064006c..76afecb11164 100644
> > > --- a/drivers/pci/pci.c
> > > +++ b/drivers/pci/pci.c
> > > @@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct
> > > pci_bus *bus)
> > >   	return max(min_delay, max_delay);
> > >   }
> > >   +static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev
> > > *child,
> > > +		unsigned long start_t, char *reset_type)
> > > +{
> > > +	int elapsed = jiffies_to_msecs(jiffies - start_t);
> > > +
> > > +	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
> > > +		return 0;
> > > +
> > > +	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
> > > +		u16 status;
> > > +
> > > +		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
> > > +
> > > +		if (!pci_dev_wait(child, reset_type, 0))
> > > +			return 0;
> > > +
> > > +		if (PCI_RESET_WAIT > elapsed)
> > > +			return PCI_RESET_WAIT - elapsed;
> > > +
> > > +		/*
> > > +		 * If the port supports active link reporting we now check
> > > +		 * whether the link is active and if not bail out early with
> > > +		 * the assumption that the device is not present anymore.
> > > +		 */
> > > +		if (!pdev->link_active_reporting)
> > > +			return -ENOTTY;
> > > +
> > > +		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
> > > +		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> > > +			return -ENOTTY;
> > > +
> > > +		if (!pci_dev_wait(child, reset_type, 0))
> > > +			return 0;
> > > +
> > > +		if (PCIE_RESET_READY_POLL_MS > elapsed)
> > > +			return PCIE_RESET_READY_POLL_MS - elapsed;
> > > +
> > > +		return -ENOTTY;
> > > +	}
> > > +
> > > +	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
> > > +		elapsed);
> > > +	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
> > > +		/* Did not train, no need to wait any further */
> > > +		pci_info(pdev, "Data Link Layer Link Active not set in %d
> > > msec\n", elapsed);
> > > +		return -ENOTTY;
> > > +	}
> > > +
> > > +	if (!pci_dev_wait(child, reset_type, 0))
> > > +		return 0;
> > > +
> > > +	if (PCIE_RESET_READY_POLL_MS > elapsed)
> > > +		return PCIE_RESET_READY_POLL_MS - elapsed;
> > > +
> > > +	return -ENOTTY;
> > > +}
> > > +
> > >   /**
> > >    * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be
> > > accessible
> > >    * @dev: PCI bridge
> > > @@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct
> > > pci_bus *bus)
> > >    * 4.3.2.
> > >    *
> > >    * Return 0 on success or -ENOTTY if the first device on the secondary
> > > bus
> > > - * failed to become accessible.
> > > + * failed to become accessible or a value greater than 0 indicates the
> > > + * left required waiting time..
> > >    */
> > > -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char
> > > *reset_type)
> > > +static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev,
> > > unsigned long start_t,
> > > +		char *reset_type)
> > >   {
> > > -	struct pci_dev *child __free(pci_dev_put) = NULL;
> > > -	int delay;
> > > +	struct pci_dev *child;
> > > +	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
> > >     	if (pci_dev_is_disconnected(dev))
> > >   		return 0;
> > > @@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev
> > > *dev, char *reset_type)
> > >   		return 0;
> > >   	}
> > >   -	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> > > -					     struct pci_dev, bus_list));
> > >   	up_read(&pci_bus_sem);
> > >     	/*
> > > @@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct
> > > pci_dev *dev, char *reset_type)
> > >   	 * accessing the device after reset (that is 1000 ms + 100 ms).
> > >   	 */
> > >   	if (!pci_is_pcie(dev)) {
> > > -		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 +
> > > delay);
> > > -		msleep(1000 + delay);
> > > +		if (1000 + delay > elapsed)
> > > +			return 1000 + delay - elapsed;
> > > +
> > > +		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
> > >   		return 0;
> > >   	}
> > >   @@ -4867,41 +4926,40 @@ int pci_bridge_wait_for_secondary_bus(struct
> > > pci_dev *dev, char *reset_type)
> > >   	if (!pcie_downstream_port(dev))
> > >   		return 0;
> > >   -	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
> > > -		u16 status;
> > > -
> > > -		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> > > -		msleep(delay);
> > > -
> > > -		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> > > -			return 0;
> > > +	if (delay > elapsed)
> > > +		return delay - elapsed;
> > >   +	down_read(&pci_bus_sem);
> > > +	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
> > >   		/*
> > > -		 * If the port supports active link reporting we now check
> > > -		 * whether the link is active and if not bail out early with
> > > -		 * the assumption that the device is not present anymore.
> > > +		 * Check if all devices under the same bus have completed
> > > +		 * the reset process, including multifunction devices in
> > > +		 * the same bus.
> > >   		 */
> > > -		if (!dev->link_active_reporting)
> > > -			return -ENOTTY;
> > > +		ret = pci_readiness_check(dev, child, start_t, reset_type);
> > >   -		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
> > > -		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> > > -			return -ENOTTY;
> > > +		if (ret == 0 && child->subordinate)
> > > +			ret = __pci_bridge_wait_for_secondary_bus(child,
> > > start_t, reset_type);
> > >   -		return pci_dev_wait(child, reset_type,
> > > -				    PCIE_RESET_READY_POLL_MS -
> > > PCI_RESET_WAIT);
> > > +		if(ret)
> > > +			break;
> > >   	}
> > > +	up_read(&pci_bus_sem);
> > >   -	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> > > -		delay);
> > > -	if (!pcie_wait_for_link_delay(dev, true, delay)) {
> > > -		/* Did not train, no need to wait any further */
> > > -		pci_info(dev, "Data Link Layer Link Active not set in %d
> > > msec\n", delay);
> > > -		return -ENOTTY;
> > > -	}
> > > +	return ret;
> > > +}
> > >   -	return pci_dev_wait(child, reset_type,
> > > -			    PCIE_RESET_READY_POLL_MS - delay);
> > > +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char
> > > *reset_type)
> > > +{
> > > +	int res = 0;
> > > +	unsigned long start_t = jiffies;
> > > +
> > > +	do {
> > > +		msleep(res);
> > > +		res = __pci_bridge_wait_for_secondary_bus(dev, start_t,
> > > reset_type);
> > > +	} while (res > 0);
> > > +
> > > +	return res;
> > >   }
> > >     void pci_reset_secondary_bus(struct pci_dev *dev)
> > > @@ -5542,10 +5600,8 @@ static void pci_bus_restore_locked(struct pci_bus
> > > *bus)
> > >     	list_for_each_entry(dev, &bus->devices, bus_list) {
> > >   		pci_dev_restore(dev);
> > > -		if (dev->subordinate) {
> > > -			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
> > > +		if (dev->subordinate)
> > >   			pci_bus_restore_locked(dev->subordinate);
> > > -		}
> > >   	}
> > >   }
> > ???
> > 
> > Unfortunately, this takes a wrong turn and is very much against the
> > feedback I gave to you.
> 
> 1. The implementation of
> pci_bridge_secondary_bus_reset/pci_bridge_wait_for_secondary_bus
> 
>    has been modified to wait for all downstream switches/link/devices to
> complete their recovery
> 
>    before returning.
> 
> 2. Therefore, when pci_bus_restore_locked is called via __pci_reset_bus, the
> waiting process has
> 
>    already been completed via
> pci_bridge_secondary_bus_reset/pci_bridge_wait_for_secondary_bus,
> 
>    so pci_bus_restore_locked does not require additional waiting.

But you must also restore the bridge's config space before waiting for 
devices in its subordinate bus, which is done wrong in this patch as it 
moves recursion out from the correct place into 
pci_bridge_wait_for_secondary_bus() that does not do any restoring of the 
config space as it recurses downwards (and it's not correct to add 
restore there either, I don't think this part of the change is fixing 
any real issue but instead adding an old issue back which has been fixed 
as I explained in my comments to v1).

It's not enough that you get it working in your case (which might be 
able to cope without restoring config space prior to wait for the 
devices).

> > >   @@ -5575,14 +5631,14 @@ static void pci_slot_restore_locked(struct
> > > pci_slot *slot)
> > >   {
> > >   	struct pci_dev *dev;
> > >   +	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
> > > +
> > >   	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
> > >   		if (!dev->slot || dev->slot != slot)
> > >   			continue;
> > >   		pci_dev_restore(dev);
> > > -		if (dev->subordinate) {
> > > -			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
> > > +		if (dev->subordinate)
> > >   			pci_bus_restore_locked(dev->subordinate);
> > > -		}
> 
> 1. When pci_slot_restore_locked is called for a PCIe slot reset,
> pci_bridge_wait_for_secondary_bus
> 
>    has already been executed to wait for all switches, links, and devices to
> complete their recovery firstly

You need to handle nested topologies right, config space restore and wait 
on every level have to be in the correct order. It's NOT CORRECT to wait 
first for the entire hierarchy and only after that restore the config 
spaces.


-- 
 i.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-12-01 12:56       ` guanghuifeng
@ 2025-12-01 13:26         ` Lukas Wunner
  2025-12-01 14:46           ` guanghuifeng
  0 siblings, 1 reply; 25+ messages in thread
From: Lukas Wunner @ 2025-12-01 13:26 UTC (permalink / raw)
  To: guanghuifeng@linux.alibaba.com
  Cc: Bjorn Helgaas, linux-pci, kanie, alikernel-developer,
	Ilpo Järvinen

On Mon, Dec 01, 2025 at 08:56:10PM +0800, guanghuifeng@linux.alibaba.com wrote:
> 2025/12/1 18:03, Lukas Wunner :
> > It seems highly unusual that the different functions of the same physical
> > device require different delays until they're accessible.  I don't think
> > we can accept such a sweeping change wholesale without more details,
> > so please share what the topology looks like (lspci -tv), what devices are
> > involved (lspci -vvv) and which device requires extra wait time for some
> > of its functions.

Could you please provide the details I asked for above?

Thanks,

Lukas

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-12-01 13:26         ` Lukas Wunner
@ 2025-12-01 14:46           ` guanghuifeng
  2025-12-01 16:18             ` Lukas Wunner
  0 siblings, 1 reply; 25+ messages in thread
From: guanghuifeng @ 2025-12-01 14:46 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: Bjorn Helgaas, linux-pci, kanie, alikernel-developer,
	Ilpo Järvinen


在 2025/12/1 21:26, Lukas Wunner 写道:
> On Mon, Dec 01, 2025 at 08:56:10PM +0800, guanghuifeng@linux.alibaba.com wrote:
>> 2025/12/1 18:03, Lukas Wunner :
>>> It seems highly unusual that the different functions of the same physical
>>> device require different delays until they're accessible.  I don't think
>>> we can accept such a sweeping change wholesale without more details,
>>> so please share what the topology looks like (lspci -tv), what devices are
>>> involved (lspci -vvv) and which device requires extra wait time for some
>>> of its functions.
> Could you please provide the details I asked for above?
>
> Thanks,
>
> Lukas

Thanks.

1. Currently, there are significant differences in reset recovery times 
among

    some PCIe multifunction devices, especially when the functions and 
complexities

    vary greatly, including some devices used for testing purposes.



2. Furthermore, similar implementations exist in various commercial devices

    (with vastly different functions), such as the multifunction devices

    (including VGA, audio functions, etc.) mentioned in the following link:

https://forums.developer.nvidia.com/t/nvidia-rtx-5090-not-detected-by-nvidia-smi-on-ubuntu-server-24-04/327409?page=2
[from the link]
lspci -nn | grep -i nvidia
c1:00.0 VGA compatible controller [0300]: NVIDIA Corporation GB202 
[GeForce RTX 5090] [10de:2b85] (rev a1)
c1:00.1 Audio device [0403]: NVIDIA Corporation Device [10de:22e8] (rev a1)

[from the link]



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] PCI: Fix PCIe SBR dev/link wait error
  2025-12-01 14:46           ` guanghuifeng
@ 2025-12-01 16:18             ` Lukas Wunner
  0 siblings, 0 replies; 25+ messages in thread
From: Lukas Wunner @ 2025-12-01 16:18 UTC (permalink / raw)
  To: guanghuifeng@linux.alibaba.com
  Cc: Bjorn Helgaas, linux-pci, kanie, alikernel-developer,
	Ilpo Järvinen

On Mon, Dec 01, 2025 at 10:46:00PM +0800, guanghuifeng@linux.alibaba.com wrote:
> 在 2025/12/1 21:26, Lukas Wunner 写道:
> > On Mon, Dec 01, 2025 at 08:56:10PM +0800, guanghuifeng@linux.alibaba.com wrote:
> > > 2025/12/1 18:03, Lukas Wunner :
> > > > It seems highly unusual that the different functions of the same
> > > > physical device require different delays until they're accessible.
> > > >  I don't think we can accept such a sweeping change wholesale without
> > > > more details, so please share what the topology looks like (lspci -tv),
> > > > what devices are involved (lspci -vvv) and which device requires
> > > > extra wait time for some of its functions.
> 
> 1. Currently, there are significant differences in reset recovery times
> among some PCIe multifunction devices, especially when the functions and
> complexities vary greatly, including some devices used for testing purposes.

Which devices are we talking about exactly?  For internal test devices,
upstream changes to accomodate to quirks are usually frowned upon.
Because the expectation is that those changes can be kept in local
downstream kernel trees and products that actually ship to customers
do not exhibit those quirks.

> 2. Furthermore, similar implementations exist in various commercial devices
> (with vastly different functions), such as the multifunction devices
> (including VGA, audio functions, etc.) mentioned in the following link:
> 
> https://forums.developer.nvidia.com/t/nvidia-rtx-5090-not-detected-by-nvidia-smi-on-ubuntu-server-24-04/327409?page=2

I've looked at that forum discussion and it seems there's an issue with
Resizeable BAR support on certain Nvidia cards, but I'm not seeing
anything on that page relating to functions taking longer than
others to come out of reset.  So this doesn't seem to be related to
the present patch.

Thanks,

Lukas

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v4 0/1] PCI: Fix PCIe SBR dev/link wait error
  2025-12-01 13:08                 ` Ilpo Järvinen
@ 2025-12-02  4:32                   ` Guanghui Feng
  2025-12-02  4:32                     ` [PATCH v4 v4 1/1] " Guanghui Feng
  0 siblings, 1 reply; 25+ messages in thread
From: Guanghui Feng @ 2025-12-02  4:32 UTC (permalink / raw)
  To: bhelgaas, ilpo.jarvinen; +Cc: linux-pci, alikernel-developer

Compared to patch v2 and v3, the process of recursively waiting for all links,
switches, and ep devices to recover in the SBR, add restores 64 bytes of
switch configuration space to enable the switch to forward probe requests normally.

Guanghui Feng (1):
  PCI: Fix PCIe SBR dev/link wait error

 drivers/pci/pci.c | 143 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 103 insertions(+), 40 deletions(-)

-- 
2.32.0.3.gf3a3e56d6


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v4 v4 1/1] PCI: Fix PCIe SBR dev/link wait error
  2025-12-02  4:32                   ` [PATCH v4 0/1] " Guanghui Feng
@ 2025-12-02  4:32                     ` Guanghui Feng
  2025-12-02 16:49                       ` Bjorn Helgaas
  2025-12-03 14:41                       ` Ilpo Järvinen
  0 siblings, 2 replies; 25+ messages in thread
From: Guanghui Feng @ 2025-12-02  4:32 UTC (permalink / raw)
  To: bhelgaas, ilpo.jarvinen; +Cc: linux-pci, alikernel-developer

When executing a PCIe secondary bus reset, all downstream switches and
endpoints will generate reset events. Simultaneously, all PCIe links
will undergo retraining, and each link will independently re-execute the
LTSSM state machine training. Therefore, after executing the SBR, it is
necessary to wait for all downstream links and devices to complete
recovery. Otherwise, after the SBR returns, accessing devices with some
links or endpoints not yet fully recovered may result in driver errors,
or even trigger device offline issues.

Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
---
 drivers/pci/pci.c | 143 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 103 insertions(+), 40 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b14dd064006c..9cf0e58ef23f 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
 	return max(min_delay, max_delay);
 }
 
+static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
+		unsigned long start_t, char *reset_type)
+{
+	int elapsed = jiffies_to_msecs(jiffies - start_t);
+
+	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
+		return 0;
+
+	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
+		u16 status;
+
+		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
+
+		if (!pci_dev_wait(child, reset_type, 0))
+			return 0;
+
+		if (PCI_RESET_WAIT > elapsed)
+			return PCI_RESET_WAIT - elapsed;
+
+		/*
+		 * If the port supports active link reporting we now check
+		 * whether the link is active and if not bail out early with
+		 * the assumption that the device is not present anymore.
+		 */
+		if (!pdev->link_active_reporting)
+			return -ENOTTY;
+
+		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
+		if (!(status & PCI_EXP_LNKSTA_DLLLA))
+			return -ENOTTY;
+
+		if (!pci_dev_wait(child, reset_type, 0))
+			return 0;
+
+		if (PCIE_RESET_READY_POLL_MS > elapsed)
+			return PCIE_RESET_READY_POLL_MS - elapsed;
+
+		return -ENOTTY;
+	}
+
+	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
+		elapsed);
+	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
+		/* Did not train, no need to wait any further */
+		pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
+		return -ENOTTY;
+	}
+
+	if (!pci_dev_wait(child, reset_type, 0))
+		return 0;
+
+	if (PCIE_RESET_READY_POLL_MS > elapsed)
+		return PCIE_RESET_READY_POLL_MS - elapsed;
+
+	return -ENOTTY;
+}
+
 /**
  * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
  * @dev: PCI bridge
@@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
  * 4.3.2.
  *
  * Return 0 on success or -ENOTTY if the first device on the secondary bus
- * failed to become accessible.
+ * failed to become accessible or a value greater than 0 indicates the
+ * left required waiting time..
  */
-int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
+static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
+		char *reset_type)
 {
-	struct pci_dev *child __free(pci_dev_put) = NULL;
-	int delay;
+	struct pci_dev *child;
+	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
 
 	if (pci_dev_is_disconnected(dev))
 		return 0;
@@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 		return 0;
 	}
 
-	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
-					     struct pci_dev, bus_list));
 	up_read(&pci_bus_sem);
 
 	/*
@@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	 * accessing the device after reset (that is 1000 ms + 100 ms).
 	 */
 	if (!pci_is_pcie(dev)) {
-		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
-		msleep(1000 + delay);
+		if (1000 + delay > elapsed)
+			return 1000 + delay - elapsed;
+
+		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
 		return 0;
 	}
 
@@ -4867,41 +4926,47 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	if (!pcie_downstream_port(dev))
 		return 0;
 
-	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
-		u16 status;
-
-		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
-		msleep(delay);
-
-		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
-			return 0;
+	if (delay > elapsed)
+		return delay - elapsed;
 
+	down_read(&pci_bus_sem);
+	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
 		/*
-		 * If the port supports active link reporting we now check
-		 * whether the link is active and if not bail out early with
-		 * the assumption that the device is not present anymore.
+		 * Check if all devices under the same bus have completed
+		 * the reset process, including multifunction devices in
+		 * the same bus.
 		 */
-		if (!dev->link_active_reporting)
-			return -ENOTTY;
+		ret = pci_readiness_check(dev, child, start_t, reset_type);
 
-		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
-		if (!(status & PCI_EXP_LNKSTA_DLLLA))
-			return -ENOTTY;
+		if (ret == 0 && child->subordinate) {
+			pci_restore_config_space(child);
+			ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);
+		}
 
-		return pci_dev_wait(child, reset_type,
-				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
+		if(ret)
+			break;
 	}
+	up_read(&pci_bus_sem);
 
-	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
-		delay);
-	if (!pcie_wait_for_link_delay(dev, true, delay)) {
-		/* Did not train, no need to wait any further */
-		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
-		return -ENOTTY;
+	return ret;
+}
+
+int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
+{
+	int res, gap = 1;
+	unsigned long start_t = jiffies;
+
+	res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
+
+	while (res > 0) {
+		gap = gap < res ? gap : res;
+		msleep(gap);
+		gap <<= 1;
+
+		res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
 	}
 
-	return pci_dev_wait(child, reset_type,
-			    PCIE_RESET_READY_POLL_MS - delay);
+	return res;
 }
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
@@ -5542,10 +5607,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		pci_dev_restore(dev);
-		if (dev->subordinate) {
-			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
+		if (dev->subordinate)
 			pci_bus_restore_locked(dev->subordinate);
-		}
 	}
 }
 
@@ -5575,14 +5638,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
 {
 	struct pci_dev *dev;
 
+	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
+
 	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
 		if (!dev->slot || dev->slot != slot)
 			continue;
 		pci_dev_restore(dev);
-		if (dev->subordinate) {
-			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
+		if (dev->subordinate)
 			pci_bus_restore_locked(dev->subordinate);
-		}
 	}
 }
 
-- 
2.32.0.3.gf3a3e56d6


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 v4 1/1] PCI: Fix PCIe SBR dev/link wait error
  2025-12-02  4:32                     ` [PATCH v4 v4 1/1] " Guanghui Feng
@ 2025-12-02 16:49                       ` Bjorn Helgaas
  2025-12-02 16:51                         ` Bjorn Helgaas
  2025-12-03 14:41                       ` Ilpo Järvinen
  1 sibling, 1 reply; 25+ messages in thread
From: Bjorn Helgaas @ 2025-12-02 16:49 UTC (permalink / raw)
  To: Guanghui Feng; +Cc: bhelgaas, ilpo.jarvinen, linux-pci, alikernel-developer

On Tue, Dec 02, 2025 at 12:32:07PM +0800, Guanghui Feng wrote:
> When executing a PCIe secondary bus reset, all downstream switches and
> endpoints will generate reset events. Simultaneously, all PCIe links
> will undergo retraining, and each link will independently re-execute the
> LTSSM state machine training. Therefore, after executing the SBR, it is
> necessary to wait for all downstream links and devices to complete
> recovery. Otherwise, after the SBR returns, accessing devices with some
> links or endpoints not yet fully recovered may result in driver errors,
> or even trigger device offline issues.
> 
> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>

Can you please supply the lspci information Lukas requested here?
https://lore.kernel.org/r/aS1oArFHeo9FAuv-@wunner.de

Also, if there is language in the PCIe spec you can cite here about
the need to independently wait for each device, that would be useful.

Bjorn

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 v4 1/1] PCI: Fix PCIe SBR dev/link wait error
  2025-12-02 16:49                       ` Bjorn Helgaas
@ 2025-12-02 16:51                         ` Bjorn Helgaas
  0 siblings, 0 replies; 25+ messages in thread
From: Bjorn Helgaas @ 2025-12-02 16:51 UTC (permalink / raw)
  To: Guanghui Feng
  Cc: bhelgaas, Ilpo Järvinen, linux-pci, alikernel-developer,
	Lukas Wunner

[+cc Lukas (please always cc people who have commented on previous
iterations of your patch)]

On Tue, Dec 02, 2025 at 10:49:01AM -0600, Bjorn Helgaas wrote:
> On Tue, Dec 02, 2025 at 12:32:07PM +0800, Guanghui Feng wrote:
> > When executing a PCIe secondary bus reset, all downstream switches and
> > endpoints will generate reset events. Simultaneously, all PCIe links
> > will undergo retraining, and each link will independently re-execute the
> > LTSSM state machine training. Therefore, after executing the SBR, it is
> > necessary to wait for all downstream links and devices to complete
> > recovery. Otherwise, after the SBR returns, accessing devices with some
> > links or endpoints not yet fully recovered may result in driver errors,
> > or even trigger device offline issues.
> > 
> > Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> > Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
> 
> Can you please supply the lspci information Lukas requested here?
> https://lore.kernel.org/r/aS1oArFHeo9FAuv-@wunner.de
> 
> Also, if there is language in the PCIe spec you can cite here about
> the need to independently wait for each device, that would be useful.
> 
> Bjorn

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 v4 1/1] PCI: Fix PCIe SBR dev/link wait error
  2025-12-02  4:32                     ` [PATCH v4 v4 1/1] " Guanghui Feng
  2025-12-02 16:49                       ` Bjorn Helgaas
@ 2025-12-03 14:41                       ` Ilpo Järvinen
  1 sibling, 0 replies; 25+ messages in thread
From: Ilpo Järvinen @ 2025-12-03 14:41 UTC (permalink / raw)
  To: Guanghui Feng; +Cc: linux-pci, alikernel-developer

On Tue, 2 Dec 2025, Guanghui Feng wrote:

> When executing a PCIe secondary bus reset, all downstream switches and
> endpoints will generate reset events. Simultaneously, all PCIe links
> will undergo retraining, and each link will independently re-execute the
> LTSSM state machine training. Therefore, after executing the SBR, it is
> necessary to wait for all downstream links and devices to complete
> recovery. Otherwise, after the SBR returns, accessing devices with some
> links or endpoints not yet fully recovered may result in driver errors,
> or even trigger device offline issues.
> 
> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
> ---
>  drivers/pci/pci.c | 143 +++++++++++++++++++++++++++++++++-------------
>  1 file changed, 103 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index b14dd064006c..9cf0e58ef23f 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>  	return max(min_delay, max_delay);
>  }
>  
> +static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
> +		unsigned long start_t, char *reset_type)
> +{
> +	int elapsed = jiffies_to_msecs(jiffies - start_t);
> +
> +	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
> +		return 0;
> +
> +	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
> +		u16 status;
> +
> +		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
> +
> +		if (!pci_dev_wait(child, reset_type, 0))
> +			return 0;
> +
> +		if (PCI_RESET_WAIT > elapsed)
> +			return PCI_RESET_WAIT - elapsed;
> +
> +		/*
> +		 * If the port supports active link reporting we now check
> +		 * whether the link is active and if not bail out early with
> +		 * the assumption that the device is not present anymore.
> +		 */
> +		if (!pdev->link_active_reporting)
> +			return -ENOTTY;
> +
> +		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
> +		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> +			return -ENOTTY;
> +
> +		if (!pci_dev_wait(child, reset_type, 0))
> +			return 0;
> +
> +		if (PCIE_RESET_READY_POLL_MS > elapsed)
> +			return PCIE_RESET_READY_POLL_MS - elapsed;
> +
> +		return -ENOTTY;
> +	}
> +
> +	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
> +		elapsed);
> +	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
> +		/* Did not train, no need to wait any further */
> +		pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
> +		return -ENOTTY;
> +	}
> +
> +	if (!pci_dev_wait(child, reset_type, 0))
> +		return 0;
> +
> +	if (PCIE_RESET_READY_POLL_MS > elapsed)
> +		return PCIE_RESET_READY_POLL_MS - elapsed;
> +
> +	return -ENOTTY;
> +}
> +
>  /**
>   * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
>   * @dev: PCI bridge
> @@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>   * 4.3.2.
>   *
>   * Return 0 on success or -ENOTTY if the first device on the secondary bus
> - * failed to become accessible.
> + * failed to become accessible or a value greater than 0 indicates the
> + * left required waiting time..
>   */
> -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
> +		char *reset_type)
>  {
> -	struct pci_dev *child __free(pci_dev_put) = NULL;
> -	int delay;
> +	struct pci_dev *child;
> +	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
>  
>  	if (pci_dev_is_disconnected(dev))
>  		return 0;
> @@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  		return 0;
>  	}
>  
> -	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> -					     struct pci_dev, bus_list));
>  	up_read(&pci_bus_sem);
>  
>  	/*
> @@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	 * accessing the device after reset (that is 1000 ms + 100 ms).
>  	 */
>  	if (!pci_is_pcie(dev)) {
> -		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
> -		msleep(1000 + delay);
> +		if (1000 + delay > elapsed)
> +			return 1000 + delay - elapsed;
> +
> +		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
>  		return 0;
>  	}
>  
> @@ -4867,41 +4926,47 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	if (!pcie_downstream_port(dev))
>  		return 0;
>  
> -	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
> -		u16 status;
> -
> -		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> -		msleep(delay);
> -
> -		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> -			return 0;
> +	if (delay > elapsed)
> +		return delay - elapsed;
>  
> +	down_read(&pci_bus_sem);
> +	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
>  		/*
> -		 * If the port supports active link reporting we now check
> -		 * whether the link is active and if not bail out early with
> -		 * the assumption that the device is not present anymore.
> +		 * Check if all devices under the same bus have completed
> +		 * the reset process, including multifunction devices in
> +		 * the same bus.
>  		 */
> -		if (!dev->link_active_reporting)
> -			return -ENOTTY;
> +		ret = pci_readiness_check(dev, child, start_t, reset_type);
>  
> -		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
> -		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> -			return -ENOTTY;
> +		if (ret == 0 && child->subordinate) {
> +			pci_restore_config_space(child);
> +			ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);

No! I'm getting tired of this. You've brushed Lukas' requests on more 
information aside and sent a new version instead (it doesn't really help 
your cause).

I've said now multiple times do not add recursion here. Instead of 
rearchitecting the approach, you're also duplicating config space restore 
here as well. Neither is justified.

The recursion is already in pci_bus_restore_locked() and it looks 
sufficient, you've never even tried to explain why it is not enough.

Also, to me looks there are two orthogonal problems which you try to mix 
up in this fix:

1. Waiting for all childs (devices multiple functions).

2. Performing waiting (an other actions) in case of nested topologies in 
   the correct order.

You should separate these on patch level to separate patch so each can be 
evaluated for its own merit.

AFAICT, 2. is only wrong in case of DPC recovery, if that's not the case 
you lack the explanation why the recursion in pci_bus_restore_locked() is 
not sufficient (don't mix up the explanation to 2. with 1. please).

It's submitters responsability to convince reviewers the patch is 
necessary, not the other way, so far I've not seen any convincing 
explanations to multiple points raised against this patch and approach it 
uses.

--
 i.

> +		}
>  
> -		return pci_dev_wait(child, reset_type,
> -				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
> +		if(ret)
> +			break;
>  	}
> +	up_read(&pci_bus_sem);
>  
> -	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> -		delay);
> -	if (!pcie_wait_for_link_delay(dev, true, delay)) {
> -		/* Did not train, no need to wait any further */
> -		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
> -		return -ENOTTY;
> +	return ret;
> +}
> +
> +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +{
> +	int res, gap = 1;
> +	unsigned long start_t = jiffies;
> +
> +	res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
> +
> +	while (res > 0) {
> +		gap = gap < res ? gap : res;
> +		msleep(gap);
> +		gap <<= 1;
> +
> +		res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
>  	}
>  
> -	return pci_dev_wait(child, reset_type,
> -			    PCIE_RESET_READY_POLL_MS - delay);
> +	return res;
>  }
>  
>  void pci_reset_secondary_bus(struct pci_dev *dev)
> @@ -5542,10 +5607,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		pci_dev_restore(dev);
> -		if (dev->subordinate) {
> -			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
> +		if (dev->subordinate)
>  			pci_bus_restore_locked(dev->subordinate);
> -		}
>  	}
>  }
>  
> @@ -5575,14 +5638,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
>  {
>  	struct pci_dev *dev;
>  
> +	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
> +
>  	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
>  		if (!dev->slot || dev->slot != slot)
>  			continue;
>  		pci_dev_restore(dev);
> -		if (dev->subordinate) {
> -			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
> +		if (dev->subordinate)
>  			pci_bus_restore_locked(dev->subordinate);
> -		}
>  	}
>  }
>  
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2025-12-03 14:41 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-24 10:45 [PATCH] PCI: Fix PCIe SBR dev/link wait error Guanghui Feng
2025-11-24 23:58 ` Bjorn Helgaas
2025-11-25  6:20   ` guanghui.fgh
2025-12-01 10:03     ` Lukas Wunner
2025-12-01 12:56       ` guanghuifeng
2025-12-01 13:26         ` Lukas Wunner
2025-12-01 14:46           ` guanghuifeng
2025-12-01 16:18             ` Lukas Wunner
2025-11-26  8:20 ` Ilpo Järvinen
2025-11-26 12:08   ` guanghui.fgh
2025-11-26 12:37     ` Ilpo Järvinen
2025-11-26 14:22       ` guanghui.fgh
2025-11-26 14:47         ` Ilpo Järvinen
2025-11-29 16:36           ` [PATCH v2] " Guanghui Feng
2025-12-01  9:21             ` Ilpo Järvinen
2025-12-01 12:21               ` guanghuifeng
2025-12-01 13:08                 ` Ilpo Järvinen
2025-12-02  4:32                   ` [PATCH v4 0/1] " Guanghui Feng
2025-12-02  4:32                     ` [PATCH v4 v4 1/1] " Guanghui Feng
2025-12-02 16:49                       ` Bjorn Helgaas
2025-12-02 16:51                         ` Bjorn Helgaas
2025-12-03 14:41                       ` Ilpo Järvinen
2025-11-30  5:17           ` [PATCH v3] " Guanghui Feng
2025-12-01  9:24             ` Ilpo Järvinen
2025-12-01 12:31               ` guanghuifeng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox