public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] PCI: Fix PCIe SBR dev/link wait error
       [not found] <2e3a1e6b-40ae-3878-e237-fb9032796af8@linux.intel.com>
@ 2025-11-29 16:36 ` Guanghui Feng
  2025-12-01  9:21   ` Ilpo Järvinen
  0 siblings, 1 reply; 4+ messages in thread
From: Guanghui Feng @ 2025-11-29 16:36 UTC (permalink / raw)
  To: bhelgaas, ilpo.jarvinen
  Cc: linux-pci, linux-kernel, kanie, alikernel-developer,
	Guanghui Feng

When executing a PCIe secondary bus reset, all downstream switches and
endpoints will generate reset events. Simultaneously, all PCIe links
will undergo retraining, and each link will independently re-execute the
LTSSM state machine training. Therefore, after executing the SBR, it is
necessary to wait for all downstream links and devices to complete
recovery. Otherwise, after the SBR returns, accessing devices with some
links or endpoints not yet fully recovered may result in driver errors,
or even trigger device offline issues.

Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
---
 drivers/pci/pci.c | 138 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 97 insertions(+), 41 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b14dd064006c..76afecb11164 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
 	return max(min_delay, max_delay);
 }
 
+static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
+		unsigned long start_t, char *reset_type)
+{
+	int elapsed = jiffies_to_msecs(jiffies - start_t);
+
+	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
+		return 0;
+
+	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
+		u16 status;
+
+		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
+
+		if (!pci_dev_wait(child, reset_type, 0))
+			return 0;
+
+		if (PCI_RESET_WAIT > elapsed)
+			return PCI_RESET_WAIT - elapsed;
+
+		/*
+		 * If the port supports active link reporting we now check
+		 * whether the link is active and if not bail out early with
+		 * the assumption that the device is not present anymore.
+		 */
+		if (!pdev->link_active_reporting)
+			return -ENOTTY;
+
+		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
+		if (!(status & PCI_EXP_LNKSTA_DLLLA))
+			return -ENOTTY;
+
+		if (!pci_dev_wait(child, reset_type, 0))
+			return 0;
+
+		if (PCIE_RESET_READY_POLL_MS > elapsed)
+			return PCIE_RESET_READY_POLL_MS - elapsed;
+
+		return -ENOTTY;
+	}
+
+	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
+		elapsed);
+	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
+		/* Did not train, no need to wait any further */
+		pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
+		return -ENOTTY;
+	}
+
+	if (!pci_dev_wait(child, reset_type, 0))
+		return 0;
+
+	if (PCIE_RESET_READY_POLL_MS > elapsed)
+		return PCIE_RESET_READY_POLL_MS - elapsed;
+
+	return -ENOTTY;
+}
+
 /**
  * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
  * @dev: PCI bridge
@@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
  * 4.3.2.
  *
  * Return 0 on success or -ENOTTY if the first device on the secondary bus
- * failed to become accessible.
+ * failed to become accessible or a value greater than 0 indicates the
+ * left required waiting time..
  */
-int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
+static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
+		char *reset_type)
 {
-	struct pci_dev *child __free(pci_dev_put) = NULL;
-	int delay;
+	struct pci_dev *child;
+	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
 
 	if (pci_dev_is_disconnected(dev))
 		return 0;
@@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 		return 0;
 	}
 
-	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
-					     struct pci_dev, bus_list));
 	up_read(&pci_bus_sem);
 
 	/*
@@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	 * accessing the device after reset (that is 1000 ms + 100 ms).
 	 */
 	if (!pci_is_pcie(dev)) {
-		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
-		msleep(1000 + delay);
+		if (1000 + delay > elapsed)
+			return 1000 + delay - elapsed;
+
+		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
 		return 0;
 	}
 
@@ -4867,41 +4926,40 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	if (!pcie_downstream_port(dev))
 		return 0;
 
-	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
-		u16 status;
-
-		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
-		msleep(delay);
-
-		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
-			return 0;
+	if (delay > elapsed)
+		return delay - elapsed;
 
+	down_read(&pci_bus_sem);
+	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
 		/*
-		 * If the port supports active link reporting we now check
-		 * whether the link is active and if not bail out early with
-		 * the assumption that the device is not present anymore.
+		 * Check if all devices under the same bus have completed
+		 * the reset process, including multifunction devices in
+		 * the same bus.
 		 */
-		if (!dev->link_active_reporting)
-			return -ENOTTY;
+		ret = pci_readiness_check(dev, child, start_t, reset_type);
 
-		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
-		if (!(status & PCI_EXP_LNKSTA_DLLLA))
-			return -ENOTTY;
+		if (ret == 0 && child->subordinate)
+			ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);
 
-		return pci_dev_wait(child, reset_type,
-				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
+		if(ret)
+			break;
 	}
+	up_read(&pci_bus_sem);
 
-	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
-		delay);
-	if (!pcie_wait_for_link_delay(dev, true, delay)) {
-		/* Did not train, no need to wait any further */
-		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
-		return -ENOTTY;
-	}
+	return ret;
+}
 
-	return pci_dev_wait(child, reset_type,
-			    PCIE_RESET_READY_POLL_MS - delay);
+int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
+{
+	int res = 0;
+	unsigned long start_t = jiffies;
+
+	do {
+		msleep(res);
+		res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
+	} while (res > 0);
+
+	return res;
 }
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
@@ -5542,10 +5600,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		pci_dev_restore(dev);
-		if (dev->subordinate) {
-			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
+		if (dev->subordinate)
 			pci_bus_restore_locked(dev->subordinate);
-		}
 	}
 }
 
@@ -5575,14 +5631,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
 {
 	struct pci_dev *dev;
 
+	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
+
 	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
 		if (!dev->slot || dev->slot != slot)
 			continue;
 		pci_dev_restore(dev);
-		if (dev->subordinate) {
-			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
+		if (dev->subordinate)
 			pci_bus_restore_locked(dev->subordinate);
-		}
 	}
 }
 
-- 
2.32.0.3.gf3a3e56d6


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] PCI: Fix PCIe SBR dev/link wait error
  2025-11-29 16:36 ` [PATCH v2] PCI: Fix PCIe SBR dev/link wait error Guanghui Feng
@ 2025-12-01  9:21   ` Ilpo Järvinen
  2025-12-01 12:21     ` guanghuifeng
  0 siblings, 1 reply; 4+ messages in thread
From: Ilpo Järvinen @ 2025-12-01  9:21 UTC (permalink / raw)
  To: Guanghui Feng; +Cc: bhelgaas, linux-pci, LKML, kanie, alikernel-developer

On Sun, 30 Nov 2025, Guanghui Feng wrote:

> When executing a PCIe secondary bus reset, all downstream switches and
> endpoints will generate reset events. Simultaneously, all PCIe links
> will undergo retraining, and each link will independently re-execute the
> LTSSM state machine training. Therefore, after executing the SBR, it is
> necessary to wait for all downstream links and devices to complete
> recovery. Otherwise, after the SBR returns, accessing devices with some
> links or endpoints not yet fully recovered may result in driver errors,
> or even trigger device offline issues.
> 
> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
> ---

Hi,

In future, when posting an update, please always explain here 
below the --- line what was changed between the versions.

>  drivers/pci/pci.c | 138 ++++++++++++++++++++++++++++++++--------------
>  1 file changed, 97 insertions(+), 41 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index b14dd064006c..76afecb11164 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>  	return max(min_delay, max_delay);
>  }
>  
> +static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
> +		unsigned long start_t, char *reset_type)
> +{
> +	int elapsed = jiffies_to_msecs(jiffies - start_t);
> +
> +	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
> +		return 0;
> +
> +	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
> +		u16 status;
> +
> +		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
> +
> +		if (!pci_dev_wait(child, reset_type, 0))
> +			return 0;
> +
> +		if (PCI_RESET_WAIT > elapsed)
> +			return PCI_RESET_WAIT - elapsed;
> +
> +		/*
> +		 * If the port supports active link reporting we now check
> +		 * whether the link is active and if not bail out early with
> +		 * the assumption that the device is not present anymore.
> +		 */
> +		if (!pdev->link_active_reporting)
> +			return -ENOTTY;
> +
> +		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
> +		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> +			return -ENOTTY;
> +
> +		if (!pci_dev_wait(child, reset_type, 0))
> +			return 0;
> +
> +		if (PCIE_RESET_READY_POLL_MS > elapsed)
> +			return PCIE_RESET_READY_POLL_MS - elapsed;
> +
> +		return -ENOTTY;
> +	}
> +
> +	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
> +		elapsed);
> +	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
> +		/* Did not train, no need to wait any further */
> +		pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
> +		return -ENOTTY;
> +	}
> +
> +	if (!pci_dev_wait(child, reset_type, 0))
> +		return 0;
> +
> +	if (PCIE_RESET_READY_POLL_MS > elapsed)
> +		return PCIE_RESET_READY_POLL_MS - elapsed;
> +
> +	return -ENOTTY;
> +}
> +
>  /**
>   * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
>   * @dev: PCI bridge
> @@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>   * 4.3.2.
>   *
>   * Return 0 on success or -ENOTTY if the first device on the secondary bus
> - * failed to become accessible.
> + * failed to become accessible or a value greater than 0 indicates the
> + * left required waiting time..
>   */
> -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
> +		char *reset_type)
>  {
> -	struct pci_dev *child __free(pci_dev_put) = NULL;
> -	int delay;
> +	struct pci_dev *child;
> +	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
>  
>  	if (pci_dev_is_disconnected(dev))
>  		return 0;
> @@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  		return 0;
>  	}
>  
> -	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> -					     struct pci_dev, bus_list));
>  	up_read(&pci_bus_sem);
>  
>  	/*
> @@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	 * accessing the device after reset (that is 1000 ms + 100 ms).
>  	 */
>  	if (!pci_is_pcie(dev)) {
> -		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
> -		msleep(1000 + delay);
> +		if (1000 + delay > elapsed)
> +			return 1000 + delay - elapsed;
> +
> +		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
>  		return 0;
>  	}
>  
> @@ -4867,41 +4926,40 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	if (!pcie_downstream_port(dev))
>  		return 0;
>  
> -	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
> -		u16 status;
> -
> -		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> -		msleep(delay);
> -
> -		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> -			return 0;
> +	if (delay > elapsed)
> +		return delay - elapsed;
>  
> +	down_read(&pci_bus_sem);
> +	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
>  		/*
> -		 * If the port supports active link reporting we now check
> -		 * whether the link is active and if not bail out early with
> -		 * the assumption that the device is not present anymore.
> +		 * Check if all devices under the same bus have completed
> +		 * the reset process, including multifunction devices in
> +		 * the same bus.
>  		 */
> -		if (!dev->link_active_reporting)
> -			return -ENOTTY;
> +		ret = pci_readiness_check(dev, child, start_t, reset_type);
>  
> -		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
> -		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> -			return -ENOTTY;
> +		if (ret == 0 && child->subordinate)
> +			ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);
>  
> -		return pci_dev_wait(child, reset_type,
> -				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
> +		if(ret)
> +			break;
>  	}
> +	up_read(&pci_bus_sem);
>  
> -	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> -		delay);
> -	if (!pcie_wait_for_link_delay(dev, true, delay)) {
> -		/* Did not train, no need to wait any further */
> -		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
> -		return -ENOTTY;
> -	}
> +	return ret;
> +}
>  
> -	return pci_dev_wait(child, reset_type,
> -			    PCIE_RESET_READY_POLL_MS - delay);
> +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
> +{
> +	int res = 0;
> +	unsigned long start_t = jiffies;
> +
> +	do {
> +		msleep(res);
> +		res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
> +	} while (res > 0);
> +
> +	return res;
>  }
>  
>  void pci_reset_secondary_bus(struct pci_dev *dev)
> @@ -5542,10 +5600,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		pci_dev_restore(dev);
> -		if (dev->subordinate) {
> -			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
> +		if (dev->subordinate)
>  			pci_bus_restore_locked(dev->subordinate);
> -		}
>  	}
>  }

???

Unfortunately, this takes a wrong turn and is very much against the 
feedback I gave to you.

>  
> @@ -5575,14 +5631,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
>  {
>  	struct pci_dev *dev;
>  
> +	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
> +
>  	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
>  		if (!dev->slot || dev->slot != slot)
>  			continue;
>  		pci_dev_restore(dev);
> -		if (dev->subordinate) {
> -			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
> +		if (dev->subordinate)
>  			pci_bus_restore_locked(dev->subordinate);
> -		}


-- 
 i.


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] PCI: Fix PCIe SBR dev/link wait error
  2025-12-01  9:21   ` Ilpo Järvinen
@ 2025-12-01 12:21     ` guanghuifeng
  2025-12-01 13:08       ` Ilpo Järvinen
  0 siblings, 1 reply; 4+ messages in thread
From: guanghuifeng @ 2025-12-01 12:21 UTC (permalink / raw)
  To: Ilpo Järvinen; +Cc: bhelgaas, linux-pci, LKML, kanie, alikernel-developer


在 2025/12/1 17:21, Ilpo Järvinen 写道:
> On Sun, 30 Nov 2025, Guanghui Feng wrote:
>
>> When executing a PCIe secondary bus reset, all downstream switches and
>> endpoints will generate reset events. Simultaneously, all PCIe links
>> will undergo retraining, and each link will independently re-execute the
>> LTSSM state machine training. Therefore, after executing the SBR, it is
>> necessary to wait for all downstream links and devices to complete
>> recovery. Otherwise, after the SBR returns, accessing devices with some
>> links or endpoints not yet fully recovered may result in driver errors,
>> or even trigger device offline issues.
>>
>> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
>> Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
>> ---
> Hi,
>
> In future, when posting an update, please always explain here
> below the --- line what was changed between the versions.
OK
>>   drivers/pci/pci.c | 138 ++++++++++++++++++++++++++++++++--------------
>>   1 file changed, 97 insertions(+), 41 deletions(-)
>>
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index b14dd064006c..76afecb11164 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>>   	return max(min_delay, max_delay);
>>   }
>>   
>> +static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
>> +		unsigned long start_t, char *reset_type)
>> +{
>> +	int elapsed = jiffies_to_msecs(jiffies - start_t);
>> +
>> +	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
>> +		return 0;
>> +
>> +	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
>> +		u16 status;
>> +
>> +		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
>> +
>> +		if (!pci_dev_wait(child, reset_type, 0))
>> +			return 0;
>> +
>> +		if (PCI_RESET_WAIT > elapsed)
>> +			return PCI_RESET_WAIT - elapsed;
>> +
>> +		/*
>> +		 * If the port supports active link reporting we now check
>> +		 * whether the link is active and if not bail out early with
>> +		 * the assumption that the device is not present anymore.
>> +		 */
>> +		if (!pdev->link_active_reporting)
>> +			return -ENOTTY;
>> +
>> +		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
>> +		if (!(status & PCI_EXP_LNKSTA_DLLLA))
>> +			return -ENOTTY;
>> +
>> +		if (!pci_dev_wait(child, reset_type, 0))
>> +			return 0;
>> +
>> +		if (PCIE_RESET_READY_POLL_MS > elapsed)
>> +			return PCIE_RESET_READY_POLL_MS - elapsed;
>> +
>> +		return -ENOTTY;
>> +	}
>> +
>> +	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
>> +		elapsed);
>> +	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
>> +		/* Did not train, no need to wait any further */
>> +		pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
>> +		return -ENOTTY;
>> +	}
>> +
>> +	if (!pci_dev_wait(child, reset_type, 0))
>> +		return 0;
>> +
>> +	if (PCIE_RESET_READY_POLL_MS > elapsed)
>> +		return PCIE_RESET_READY_POLL_MS - elapsed;
>> +
>> +	return -ENOTTY;
>> +}
>> +
>>   /**
>>    * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
>>    * @dev: PCI bridge
>> @@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>>    * 4.3.2.
>>    *
>>    * Return 0 on success or -ENOTTY if the first device on the secondary bus
>> - * failed to become accessible.
>> + * failed to become accessible or a value greater than 0 indicates the
>> + * left required waiting time..
>>    */
>> -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>> +static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
>> +		char *reset_type)
>>   {
>> -	struct pci_dev *child __free(pci_dev_put) = NULL;
>> -	int delay;
>> +	struct pci_dev *child;
>> +	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
>>   
>>   	if (pci_dev_is_disconnected(dev))
>>   		return 0;
>> @@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>>   		return 0;
>>   	}
>>   
>> -	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
>> -					     struct pci_dev, bus_list));
>>   	up_read(&pci_bus_sem);
>>   
>>   	/*
>> @@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>>   	 * accessing the device after reset (that is 1000 ms + 100 ms).
>>   	 */
>>   	if (!pci_is_pcie(dev)) {
>> -		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
>> -		msleep(1000 + delay);
>> +		if (1000 + delay > elapsed)
>> +			return 1000 + delay - elapsed;
>> +
>> +		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
>>   		return 0;
>>   	}
>>   
>> @@ -4867,41 +4926,40 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>>   	if (!pcie_downstream_port(dev))
>>   		return 0;
>>   
>> -	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
>> -		u16 status;
>> -
>> -		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
>> -		msleep(delay);
>> -
>> -		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
>> -			return 0;
>> +	if (delay > elapsed)
>> +		return delay - elapsed;
>>   
>> +	down_read(&pci_bus_sem);
>> +	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
>>   		/*
>> -		 * If the port supports active link reporting we now check
>> -		 * whether the link is active and if not bail out early with
>> -		 * the assumption that the device is not present anymore.
>> +		 * Check if all devices under the same bus have completed
>> +		 * the reset process, including multifunction devices in
>> +		 * the same bus.
>>   		 */
>> -		if (!dev->link_active_reporting)
>> -			return -ENOTTY;
>> +		ret = pci_readiness_check(dev, child, start_t, reset_type);
>>   
>> -		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
>> -		if (!(status & PCI_EXP_LNKSTA_DLLLA))
>> -			return -ENOTTY;
>> +		if (ret == 0 && child->subordinate)
>> +			ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);
>>   
>> -		return pci_dev_wait(child, reset_type,
>> -				    PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
>> +		if(ret)
>> +			break;
>>   	}
>> +	up_read(&pci_bus_sem);
>>   
>> -	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
>> -		delay);
>> -	if (!pcie_wait_for_link_delay(dev, true, delay)) {
>> -		/* Did not train, no need to wait any further */
>> -		pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
>> -		return -ENOTTY;
>> -	}
>> +	return ret;
>> +}
>>   
>> -	return pci_dev_wait(child, reset_type,
>> -			    PCIE_RESET_READY_POLL_MS - delay);
>> +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>> +{
>> +	int res = 0;
>> +	unsigned long start_t = jiffies;
>> +
>> +	do {
>> +		msleep(res);
>> +		res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
>> +	} while (res > 0);
>> +
>> +	return res;
>>   }
>>   
>>   void pci_reset_secondary_bus(struct pci_dev *dev)
>> @@ -5542,10 +5600,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
>>   
>>   	list_for_each_entry(dev, &bus->devices, bus_list) {
>>   		pci_dev_restore(dev);
>> -		if (dev->subordinate) {
>> -			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
>> +		if (dev->subordinate)
>>   			pci_bus_restore_locked(dev->subordinate);
>> -		}
>>   	}
>>   }
> ???
>
> Unfortunately, this takes a wrong turn and is very much against the
> feedback I gave to you.

1. The implementation of 
pci_bridge_secondary_bus_reset/pci_bridge_wait_for_secondary_bus

    has been modified to wait for all downstream switches/link/devices 
to complete their recovery

    before returning.

2. Therefore, when pci_bus_restore_locked is called via __pci_reset_bus, 
the waiting process has

    already been completed via 
pci_bridge_secondary_bus_reset/pci_bridge_wait_for_secondary_bus,

    so pci_bus_restore_locked does not require additional waiting.

>>   
>> @@ -5575,14 +5631,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
>>   {
>>   	struct pci_dev *dev;
>>   
>> +	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
>> +
>>   	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
>>   		if (!dev->slot || dev->slot != slot)
>>   			continue;
>>   		pci_dev_restore(dev);
>> -		if (dev->subordinate) {
>> -			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
>> +		if (dev->subordinate)
>>   			pci_bus_restore_locked(dev->subordinate);
>> -		}

1. When pci_slot_restore_locked is called for a PCIe slot reset, 
pci_bridge_wait_for_secondary_bus

    has already been executed to wait for all switches, links, and 
devices to complete their recovery firstly

>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] PCI: Fix PCIe SBR dev/link wait error
  2025-12-01 12:21     ` guanghuifeng
@ 2025-12-01 13:08       ` Ilpo Järvinen
  0 siblings, 0 replies; 4+ messages in thread
From: Ilpo Järvinen @ 2025-12-01 13:08 UTC (permalink / raw)
  To: guanghuifeng@linux.alibaba.com
  Cc: bhelgaas, linux-pci, LKML, kanie, alikernel-developer

[-- Attachment #1: Type: text/plain, Size: 10747 bytes --]

On Mon, 1 Dec 2025, guanghuifeng@linux.alibaba.com wrote:
> 在 2025/12/1 17:21, Ilpo Järvinen 写道:
> > On Sun, 30 Nov 2025, Guanghui Feng wrote:
> > 
> > > When executing a PCIe secondary bus reset, all downstream switches and
> > > endpoints will generate reset events. Simultaneously, all PCIe links
> > > will undergo retraining, and each link will independently re-execute the
> > > LTSSM state machine training. Therefore, after executing the SBR, it is
> > > necessary to wait for all downstream links and devices to complete
> > > recovery. Otherwise, after the SBR returns, accessing devices with some
> > > links or endpoints not yet fully recovered may result in driver errors,
> > > or even trigger device offline issues.
> > > 
> > > Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> > > Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
> > > ---
> > Hi,
> > 
> > In future, when posting an update, please always explain here
> > below the --- line what was changed between the versions.
> OK
> > >   drivers/pci/pci.c | 138 ++++++++++++++++++++++++++++++++--------------
> > >   1 file changed, 97 insertions(+), 41 deletions(-)
> > > 
> > > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > > index b14dd064006c..76afecb11164 100644
> > > --- a/drivers/pci/pci.c
> > > +++ b/drivers/pci/pci.c
> > > @@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct
> > > pci_bus *bus)
> > >   	return max(min_delay, max_delay);
> > >   }
> > >   +static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev
> > > *child,
> > > +		unsigned long start_t, char *reset_type)
> > > +{
> > > +	int elapsed = jiffies_to_msecs(jiffies - start_t);
> > > +
> > > +	if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
> > > +		return 0;
> > > +
> > > +	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
> > > +		u16 status;
> > > +
> > > +		pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
> > > +
> > > +		if (!pci_dev_wait(child, reset_type, 0))
> > > +			return 0;
> > > +
> > > +		if (PCI_RESET_WAIT > elapsed)
> > > +			return PCI_RESET_WAIT - elapsed;
> > > +
> > > +		/*
> > > +		 * If the port supports active link reporting we now check
> > > +		 * whether the link is active and if not bail out early with
> > > +		 * the assumption that the device is not present anymore.
> > > +		 */
> > > +		if (!pdev->link_active_reporting)
> > > +			return -ENOTTY;
> > > +
> > > +		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
> > > +		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> > > +			return -ENOTTY;
> > > +
> > > +		if (!pci_dev_wait(child, reset_type, 0))
> > > +			return 0;
> > > +
> > > +		if (PCIE_RESET_READY_POLL_MS > elapsed)
> > > +			return PCIE_RESET_READY_POLL_MS - elapsed;
> > > +
> > > +		return -ENOTTY;
> > > +	}
> > > +
> > > +	pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
> > > +		elapsed);
> > > +	if (!pcie_wait_for_link_delay(pdev, true, 0)) {
> > > +		/* Did not train, no need to wait any further */
> > > +		pci_info(pdev, "Data Link Layer Link Active not set in %d
> > > msec\n", elapsed);
> > > +		return -ENOTTY;
> > > +	}
> > > +
> > > +	if (!pci_dev_wait(child, reset_type, 0))
> > > +		return 0;
> > > +
> > > +	if (PCIE_RESET_READY_POLL_MS > elapsed)
> > > +		return PCIE_RESET_READY_POLL_MS - elapsed;
> > > +
> > > +	return -ENOTTY;
> > > +}
> > > +
> > >   /**
> > >    * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be
> > > accessible
> > >    * @dev: PCI bridge
> > > @@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct
> > > pci_bus *bus)
> > >    * 4.3.2.
> > >    *
> > >    * Return 0 on success or -ENOTTY if the first device on the secondary
> > > bus
> > > - * failed to become accessible.
> > > + * failed to become accessible or a value greater than 0 indicates the
> > > + * left required waiting time..
> > >    */
> > > -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char
> > > *reset_type)
> > > +static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev,
> > > unsigned long start_t,
> > > +		char *reset_type)
> > >   {
> > > -	struct pci_dev *child __free(pci_dev_put) = NULL;
> > > -	int delay;
> > > +	struct pci_dev *child;
> > > +	int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
> > >     	if (pci_dev_is_disconnected(dev))
> > >   		return 0;
> > > @@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev
> > > *dev, char *reset_type)
> > >   		return 0;
> > >   	}
> > >   -	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> > > -					     struct pci_dev, bus_list));
> > >   	up_read(&pci_bus_sem);
> > >     	/*
> > > @@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct
> > > pci_dev *dev, char *reset_type)
> > >   	 * accessing the device after reset (that is 1000 ms + 100 ms).
> > >   	 */
> > >   	if (!pci_is_pcie(dev)) {
> > > -		pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 +
> > > delay);
> > > -		msleep(1000 + delay);
> > > +		if (1000 + delay > elapsed)
> > > +			return 1000 + delay - elapsed;
> > > +
> > > +		pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
> > >   		return 0;
> > >   	}
> > >   @@ -4867,41 +4926,40 @@ int pci_bridge_wait_for_secondary_bus(struct
> > > pci_dev *dev, char *reset_type)
> > >   	if (!pcie_downstream_port(dev))
> > >   		return 0;
> > >   -	if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
> > > -		u16 status;
> > > -
> > > -		pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
> > > -		msleep(delay);
> > > -
> > > -		if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
> > > -			return 0;
> > > +	if (delay > elapsed)
> > > +		return delay - elapsed;
> > >   +	down_read(&pci_bus_sem);
> > > +	list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
> > >   		/*
> > > -		 * If the port supports active link reporting we now check
> > > -		 * whether the link is active and if not bail out early with
> > > -		 * the assumption that the device is not present anymore.
> > > +		 * Check if all devices under the same bus have completed
> > > +		 * the reset process, including multifunction devices in
> > > +		 * the same bus.
> > >   		 */
> > > -		if (!dev->link_active_reporting)
> > > -			return -ENOTTY;
> > > +		ret = pci_readiness_check(dev, child, start_t, reset_type);
> > >   -		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
> > > -		if (!(status & PCI_EXP_LNKSTA_DLLLA))
> > > -			return -ENOTTY;
> > > +		if (ret == 0 && child->subordinate)
> > > +			ret = __pci_bridge_wait_for_secondary_bus(child,
> > > start_t, reset_type);
> > >   -		return pci_dev_wait(child, reset_type,
> > > -				    PCIE_RESET_READY_POLL_MS -
> > > PCI_RESET_WAIT);
> > > +		if(ret)
> > > +			break;
> > >   	}
> > > +	up_read(&pci_bus_sem);
> > >   -	pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
> > > -		delay);
> > > -	if (!pcie_wait_for_link_delay(dev, true, delay)) {
> > > -		/* Did not train, no need to wait any further */
> > > -		pci_info(dev, "Data Link Layer Link Active not set in %d
> > > msec\n", delay);
> > > -		return -ENOTTY;
> > > -	}
> > > +	return ret;
> > > +}
> > >   -	return pci_dev_wait(child, reset_type,
> > > -			    PCIE_RESET_READY_POLL_MS - delay);
> > > +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char
> > > *reset_type)
> > > +{
> > > +	int res = 0;
> > > +	unsigned long start_t = jiffies;
> > > +
> > > +	do {
> > > +		msleep(res);
> > > +		res = __pci_bridge_wait_for_secondary_bus(dev, start_t,
> > > reset_type);
> > > +	} while (res > 0);
> > > +
> > > +	return res;
> > >   }
> > >     void pci_reset_secondary_bus(struct pci_dev *dev)
> > > @@ -5542,10 +5600,8 @@ static void pci_bus_restore_locked(struct pci_bus
> > > *bus)
> > >     	list_for_each_entry(dev, &bus->devices, bus_list) {
> > >   		pci_dev_restore(dev);
> > > -		if (dev->subordinate) {
> > > -			pci_bridge_wait_for_secondary_bus(dev, "bus reset");
> > > +		if (dev->subordinate)
> > >   			pci_bus_restore_locked(dev->subordinate);
> > > -		}
> > >   	}
> > >   }
> > ???
> > 
> > Unfortunately, this takes a wrong turn and is very much against the
> > feedback I gave to you.
> 
> 1. The implementation of
> pci_bridge_secondary_bus_reset/pci_bridge_wait_for_secondary_bus
> 
>    has been modified to wait for all downstream switches/link/devices to
> complete their recovery
> 
>    before returning.
> 
> 2. Therefore, when pci_bus_restore_locked is called via __pci_reset_bus, the
> waiting process has
> 
>    already been completed via
> pci_bridge_secondary_bus_reset/pci_bridge_wait_for_secondary_bus,
> 
>    so pci_bus_restore_locked does not require additional waiting.

But you must also restore the bridge's config space before waiting for 
devices in its subordinate bus, which is done wrong in this patch as it 
moves recursion out from the correct place into 
pci_bridge_wait_for_secondary_bus() that does not do any restoring of the 
config space as it recurses downwards (and it's not correct to add 
restore there either, I don't think this part of the change is fixing 
any real issue but instead adding an old issue back which has been fixed 
as I explained in my comments to v1).

It's not enough that you get it working in your case (which might be 
able to cope without restoring config space prior to wait for the 
devices).

> > >   @@ -5575,14 +5631,14 @@ static void pci_slot_restore_locked(struct
> > > pci_slot *slot)
> > >   {
> > >   	struct pci_dev *dev;
> > >   +	pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
> > > +
> > >   	list_for_each_entry(dev, &slot->bus->devices, bus_list) {
> > >   		if (!dev->slot || dev->slot != slot)
> > >   			continue;
> > >   		pci_dev_restore(dev);
> > > -		if (dev->subordinate) {
> > > -			pci_bridge_wait_for_secondary_bus(dev, "slot reset");
> > > +		if (dev->subordinate)
> > >   			pci_bus_restore_locked(dev->subordinate);
> > > -		}
> 
> 1. When pci_slot_restore_locked is called for a PCIe slot reset,
> pci_bridge_wait_for_secondary_bus
> 
>    has already been executed to wait for all switches, links, and devices to
> complete their recovery firstly

You need to handle nested topologies right, config space restore and wait 
on every level have to be in the correct order. It's NOT CORRECT to wait 
first for the entire hierarchy and only after that restore the config 
spaces.


-- 
 i.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-12-01 13:08 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <2e3a1e6b-40ae-3878-e237-fb9032796af8@linux.intel.com>
2025-11-29 16:36 ` [PATCH v2] PCI: Fix PCIe SBR dev/link wait error Guanghui Feng
2025-12-01  9:21   ` Ilpo Järvinen
2025-12-01 12:21     ` guanghuifeng
2025-12-01 13:08       ` Ilpo Järvinen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox