* [PATCH 1/1] PCI/bwctrl: Disable PCIe BW controller during reset
@ 2025-02-17 16:52 Ilpo Järvinen
From: Ilpo Järvinen @ 2025-02-17 16:52 UTC (permalink / raw)
To: Lukas Wunner, Bjorn Helgaas, Ilpo Järvinen, Jonathan Cameron,
linux-pci, linux-kernel
Cc: Joel Mathew Thomas, stable
PCIe BW controller enables BW notifications for Downstream Ports by
setting Link Bandwidth Management Interrupt Enable (LBMIE) and Link
Autonomous Bandwidth Interrupt Enable (LABIE) (PCIe Spec. r6.2 sec.
7.5.3.7).
It was discovered that performing a reset can lead to the device
underneath the Downstream Port becoming unavailable if BW notifications
are left enabled throughout the reset sequence (at least LBMIE was
found to cause an issue).
While the PCIe Specifications do not indicate that BW notifications could
not be kept enabled during resets, the PCIe Link state during an
intentional reset is not of great interest. Thus, disable the BW
controller for the bridge while the reset is performed and re-enable it
after the reset has completed, to work around the problems some devices
encounter if BW notifications are left on throughout the reset sequence.
Keep a counter for the disable/enable because MFD will execute
pci_dev_save_and_disable() and pci_dev_restore() back to back for
sibling devices:
[ 50.139010] vfio-pci 0000:01:00.0: resetting
[ 50.139053] vfio-pci 0000:01:00.1: resetting
[ 50.141126] pcieport 0000:00:01.1: PME: Spurious native interrupt!
[ 50.141133] pcieport 0000:00:01.1: PME: Spurious native interrupt!
[ 50.441466] vfio-pci 0000:01:00.0: reset done
[ 50.501534] vfio-pci 0000:01:00.1: reset done
Fixes: 665745f27487 ("PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller")
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219765
Tested-by: Joel Mathew Thomas <proxy0@tutamail.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: stable@vger.kernel.org
---
I suspect the root cause is some kind of violation of specifications.
Resets shouldn't cause devices to become unavailable just because BW
notifications have been enabled.
Before somebody comments on those dual rwsems: I do have a yet-to-be-
submitted patch that simplifies the locking as per Lukas Wunner's earlier
suggestion. I've just focused on solving the regressions first.
drivers/pci/pci.c | 8 +++++++
drivers/pci/pci.h | 4 ++++
drivers/pci/pcie/bwctrl.c | 49 ++++++++++++++++++++++++++++++++-------
3 files changed, 53 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 869d204a70a3..7a53d7474175 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5148,6 +5148,7 @@ static void pci_dev_save_and_disable(struct pci_dev *dev)
{
const struct pci_error_handlers *err_handler =
dev->driver ? dev->driver->err_handler : NULL;
+ struct pci_dev *bridge = pci_upstream_bridge(dev);
/*
* dev->driver->err_handler->reset_prepare() is protected against
@@ -5166,6 +5167,9 @@ static void pci_dev_save_and_disable(struct pci_dev *dev)
*/
pci_set_power_state(dev, PCI_D0);
+ if (bridge)
+ pcie_bwnotif_disable(bridge);
+
pci_save_state(dev);
/*
* Disable the device by clearing the Command register, except for
@@ -5181,9 +5185,13 @@ static void pci_dev_restore(struct pci_dev *dev)
{
const struct pci_error_handlers *err_handler =
dev->driver ? dev->driver->err_handler : NULL;
+ struct pci_dev *bridge = pci_upstream_bridge(dev);
pci_restore_state(dev);
+ if (bridge)
+ pcie_bwnotif_enable(bridge);
+
/*
* dev->driver->err_handler->reset_done() is protected against
* races with ->remove() by the device lock, which must be held by
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 01e51db8d285..856546f1aad9 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -759,12 +759,16 @@ static inline void pcie_ecrc_get_policy(char *str) { }
#ifdef CONFIG_PCIEPORTBUS
void pcie_reset_lbms_count(struct pci_dev *port);
int pcie_lbms_count(struct pci_dev *port, unsigned long *val);
+void pcie_bwnotif_enable(struct pci_dev *port);
+void pcie_bwnotif_disable(struct pci_dev *port);
#else
static inline void pcie_reset_lbms_count(struct pci_dev *port) {}
static inline int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
{
return -EOPNOTSUPP;
}
+static inline void pcie_bwnotif_enable(struct pci_dev *port) {}
+static inline void pcie_bwnotif_disable(struct pci_dev *port) {}
#endif
struct pci_dev_reset_methods {
diff --git a/drivers/pci/pcie/bwctrl.c b/drivers/pci/pcie/bwctrl.c
index 0a5e7efbce2c..a117f6f67c07 100644
--- a/drivers/pci/pcie/bwctrl.c
+++ b/drivers/pci/pcie/bwctrl.c
@@ -40,11 +40,13 @@
* @set_speed_mutex: Serializes link speed changes
* @lbms_count: Count for LBMS (since last reset)
* @cdev: Thermal cooling device associated with the port
+ * @disable_count: BW notifications disabled/enabled counter
*/
struct pcie_bwctrl_data {
struct mutex set_speed_mutex;
atomic_t lbms_count;
struct thermal_cooling_device *cdev;
+ int disable_count;
};
/*
@@ -200,10 +202,9 @@ int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
return ret;
}
-static void pcie_bwnotif_enable(struct pcie_device *srv)
+static void __pcie_bwnotif_enable(struct pci_dev *port)
{
- struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
- struct pci_dev *port = srv->port;
+ struct pcie_bwctrl_data *data = port->link_bwctrl;
u16 link_status;
int ret;
@@ -224,12 +225,44 @@ static void pcie_bwnotif_enable(struct pcie_device *srv)
pcie_update_link_speed(port->subordinate);
}
-static void pcie_bwnotif_disable(struct pci_dev *port)
+void pcie_bwnotif_enable(struct pci_dev *port)
+{
+ guard(rwsem_read)(&pcie_bwctrl_setspeed_rwsem);
+ guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
+
+ if (!port->link_bwctrl)
+ return;
+
+ port->link_bwctrl->disable_count--;
+ if (!port->link_bwctrl->disable_count) {
+ __pcie_bwnotif_enable(port);
+ pci_dbg(port, "BW notifications enabled\n");
+ }
+ WARN_ON_ONCE(port->link_bwctrl->disable_count < 0);
+}
+
+static void __pcie_bwnotif_disable(struct pci_dev *port)
{
pcie_capability_clear_word(port, PCI_EXP_LNKCTL,
PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
}
+void pcie_bwnotif_disable(struct pci_dev *port)
+{
+ guard(rwsem_read)(&pcie_bwctrl_setspeed_rwsem);
+ guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
+
+ if (!port->link_bwctrl)
+ return;
+
+ port->link_bwctrl->disable_count++;
+
+ if (port->link_bwctrl->disable_count == 1) {
+ __pcie_bwnotif_disable(port);
+ pci_dbg(port, "BW notifications disabled\n");
+ }
+}
+
static irqreturn_t pcie_bwnotif_irq(int irq, void *context)
{
struct pcie_device *srv = context;
@@ -314,7 +347,7 @@ static int pcie_bwnotif_probe(struct pcie_device *srv)
return ret;
}
- pcie_bwnotif_enable(srv);
+ __pcie_bwnotif_enable(port);
}
}
@@ -336,7 +369,7 @@ static void pcie_bwnotif_remove(struct pcie_device *srv)
scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) {
scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
- pcie_bwnotif_disable(srv->port);
+ __pcie_bwnotif_disable(srv->port);
free_irq(srv->irq, srv);
@@ -347,13 +380,13 @@ static void pcie_bwnotif_remove(struct pcie_device *srv)
static int pcie_bwnotif_suspend(struct pcie_device *srv)
{
- pcie_bwnotif_disable(srv->port);
+ __pcie_bwnotif_disable(srv->port);
return 0;
}
static int pcie_bwnotif_resume(struct pcie_device *srv)
{
- pcie_bwnotif_enable(srv);
+ __pcie_bwnotif_enable(srv->port);
return 0;
}
base-commit: 2014c95afecee3e76ca4a56956a936e23283f05b
--
2.39.5
* Re: [PATCH 1/1] PCI/bwctrl: Disable PCIe BW controller during reset
From: Lukas Wunner @ 2025-02-18 8:59 UTC (permalink / raw)
To: Ilpo Järvinen
Cc: Bjorn Helgaas, Jonathan Cameron, linux-pci, linux-kernel,
Joel Mathew Thomas, stable
On Mon, Feb 17, 2025 at 06:52:58PM +0200, Ilpo Järvinen wrote:
> PCIe BW controller enables BW notifications for Downstream Ports by
> setting Link Bandwidth Management Interrupt Enable (LBMIE) and Link
> Autonomous Bandwidth Interrupt Enable (LABIE) (PCIe Spec. r6.2 sec.
> 7.5.3.7).
>
> It was discovered that performing a reset can lead to the device
> underneath the Downstream Port becoming unavailable if BW notifications
> are left enabled throughout the reset sequence (at least LBMIE was
> found to cause an issue).
What kind of reset? FLR? SBR? This needs to be specified in the
commit message so that the reader isn't forced to sift through a
bugzilla with dozens of comments and attachments.
The commit message should also mention the type of affected device
(Nvidia GPU AD107M [GeForce RTX 4050 Max-Q / Mobile]). The Root Port
above is an AMD one, that may be relevant as well.
> While the PCIe Specifications do not indicate BW notifications could not
> be kept enabled during resets, the PCIe Link state during an
> intentional reset is not of large interest. Thus, disable BW controller
> for the bridge while reset is performed and re-enable it after the
> reset has completed to workaround the problems some devices encounter
> if BW notifications are left on throughout the reset sequence.
This approach won't work if the reset is performed without software
intervention. E.g. if a DPC event occurs, the device likewise undergoes
a reset but there is no prior system software involvement. Software only
becomes involved *after* the reset has occurred.
I think it needs to be tested if that same issue occurs with DPC.
It's easy to simulate DPC by setting the Software Trigger bit:
setpci -s 00:01.1 ECAP_DPC+6.w=40:40
If the issue does occur with DPC then this fix isn't sufficient.
> Keep a counter for the disable/enable because MFD will execute
> pci_dev_save_and_disable() and pci_dev_restore() back to back for
> sibling devices:
>
> [ 50.139010] vfio-pci 0000:01:00.0: resetting
> [ 50.139053] vfio-pci 0000:01:00.1: resetting
> [ 50.141126] pcieport 0000:00:01.1: PME: Spurious native interrupt!
> [ 50.141133] pcieport 0000:00:01.1: PME: Spurious native interrupt!
> [ 50.441466] vfio-pci 0000:01:00.0: reset done
> [ 50.501534] vfio-pci 0000:01:00.1: reset done
So why are you citing the PME messages here? Are they relevant?
Do they not occur when the bandwidth controller is disabled?
If they do not, they may provide a clue what's going on.
But that's not clear from the commit message.
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -5166,6 +5167,9 @@ static void pci_dev_save_and_disable(struct pci_dev *dev)
> */
> pci_set_power_state(dev, PCI_D0);
>
> + if (bridge)
> + pcie_bwnotif_disable(bridge);
> +
> pci_save_state(dev);
Instead of putting this in the PCI core, amend pcie_portdrv_err_handler
with ->reset_prepare and ->reset_done callbacks which call down to all
the port service drivers, then amend bwctrl.c to disable/enable
interrupts in these callbacks.
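Roughly along these lines (untested sketch; pcie_walk_port_services() is a
made-up helper, just to illustrate calling down to the service drivers):

static void pcie_portdrv_reset_prepare(struct pci_dev *dev)
{
	/* Let each port service (bwctrl, PME, hotplug, ...) quiesce. */
	pcie_walk_port_services(dev, RESET_PREPARE);	/* hypothetical */
}

static void pcie_portdrv_reset_done(struct pci_dev *dev)
{
	pcie_walk_port_services(dev, RESET_DONE);	/* hypothetical */
}

...plus .reset_prepare/.reset_done wired up in the existing
pcie_portdrv_err_handler, and bwctrl's service callbacks clearing/setting
LBMIE and LABIE.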
> + port->link_bwctrl->disable_count--;
> + if (!port->link_bwctrl->disable_count) {
> + __pcie_bwnotif_enable(port);
> + pci_dbg(port, "BW notifications enabled\n");
> + }
> + WARN_ON_ONCE(port->link_bwctrl->disable_count < 0);
So why do you need to count this? IIUC you get two consecutive
disable and two consecutive enable events.
If the interrupts are already disabled, just do nothing.
Same for enablement. Any reason this simpler approach
doesn't work?
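I.e. roughly this (untested sketch; the ->disabled bool is made up here,
standing in for the counter):

void pcie_bwnotif_disable(struct pci_dev *port)
{
	guard(rwsem_read)(&pcie_bwctrl_setspeed_rwsem);
	guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);

	if (!port->link_bwctrl || port->link_bwctrl->disabled)
		return;

	__pcie_bwnotif_disable(port);
	port->link_bwctrl->disabled = true;
}

void pcie_bwnotif_enable(struct pci_dev *port)
{
	guard(rwsem_read)(&pcie_bwctrl_setspeed_rwsem);
	guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);

	if (!port->link_bwctrl || !port->link_bwctrl->disabled)
		return;

	__pcie_bwnotif_enable(port);
	port->link_bwctrl->disabled = false;
}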
Thanks,
Lukas
* Re: [PATCH 1/1] PCI/bwctrl: Disable PCIe BW controller during reset
From: Ilpo Järvinen @ 2025-02-24 15:13 UTC (permalink / raw)
To: Lukas Wunner
Cc: Bjorn Helgaas, Jonathan Cameron, linux-pci, LKML,
Joel Mathew Thomas, stable
On Tue, 18 Feb 2025, Lukas Wunner wrote:
> On Mon, Feb 17, 2025 at 06:52:58PM +0200, Ilpo Järvinen wrote:
> > PCIe BW controller enables BW notifications for Downstream Ports by
> > setting Link Bandwidth Management Interrupt Enable (LBMIE) and Link
> > Autonomous Bandwidth Interrupt Enable (LABIE) (PCIe Spec. r6.2 sec.
> > 7.5.3.7).
> >
> > It was discovered that performing a reset can lead to the device
> > underneath the Downstream Port becoming unavailable if BW notifications
> > are left enabled throughout the reset sequence (at least LBMIE was
> > found to cause an issue).
>
> What kind of reset? FLR? SBR? This needs to be specified in the
> commit message so that the reader isn't forced to sift through a
> bugzilla with dozens of comments and attachments.
Heh, I never really tried to figure it out because the reset-disable
patch was just a stab-in-the-dark style patch. To my surprise, it ended
up working (after the initial confusion was resolved), and I just started
to prepare this patch from that knowledge.
Logs do mention this:
[ 21.560206] pcieport 0000:00:01.1: unlocked secondary bus reset via: pciehp_reset_slot+0x98/0x140
...so it seems to be SBR.
> The commit message should also mention the type of affected device
> (Nvidia GPU AD107M [GeForce RTX 4050 Max-Q / Mobile]). The Root Port
> above is an AMD one, that may be relevant as well.
Okay.
> > While the PCIe Specifications do not indicate BW notifications could not
> > be kept enabled during resets, the PCIe Link state during an
> > intentional reset is not of large interest. Thus, disable BW controller
> > for the bridge while reset is performed and re-enable it after the
> > reset has completed to workaround the problems some devices encounter
> > if BW notifications are left on throughout the reset sequence.
>
> This approach won't work if the reset is performed without software
> intervention. E.g. if a DPC event occurs, the device likewise undergoes
> a reset but there is no prior system software involvement. Software only
> becomes involved *after* the reset has occurred.
>
> I think it needs to be tested if that same issue occurs with DPC.
> It's easy to simulate DPC by setting the Software Trigger bit:
>
> setpci -s 00:01.1 ECAP_DPC+6.w=40:40
>
> If the issue does occur with DPC then this fix isn't sufficient.
Looking into lspci logs, I don't see DPC capability being there for
00:01.1?!
> > Keep a counter for the disable/enable because MFD will execute
> > pci_dev_save_and_disable() and pci_dev_restore() back to back for
> > sibling devices:
> >
> > [ 50.139010] vfio-pci 0000:01:00.0: resetting
> > [ 50.139053] vfio-pci 0000:01:00.1: resetting
> > [ 50.141126] pcieport 0000:00:01.1: PME: Spurious native interrupt!
> > [ 50.141133] pcieport 0000:00:01.1: PME: Spurious native interrupt!
> > [ 50.441466] vfio-pci 0000:01:00.0: reset done
> > [ 50.501534] vfio-pci 0000:01:00.1: reset done
>
> So why are you citing the PME messages here? Are they relevant?
> Do they not occur when the bandwidth controller is disabled?
> If they do not, they may provide a clue what's going on.
> But that's not clear from the commit message.
They do also occur when the BW controller is disabled for the duration of
the reset. What I don't currently have at hand is a comparable log from
the era prior to any BW controller commits (from 6.12); the one currently
in bugzilla has the out-of-tree module loaded.
PME and BW notifications do share the interrupt, so I'd not entirely
discount their relevance, and one of my theories (never mentioned
anywhere until now) was that the extra interrupts that come due to BW
notifications somehow manage to confuse the PME driver. But I never found
anything to that effect.
I can remove the PME lines, though.
> > --- a/drivers/pci/pci.c
> > +++ b/drivers/pci/pci.c
> > @@ -5166,6 +5167,9 @@ static void pci_dev_save_and_disable(struct pci_dev *dev)
> > */
> > pci_set_power_state(dev, PCI_D0);
> >
> > + if (bridge)
> > + pcie_bwnotif_disable(bridge);
> > +
> > pci_save_state(dev);
>
> Instead of putting this in the PCI core, amend pcie_portdrv_err_handler
> with ->reset_prepare and ->reset_done callbacks which call down to all
> the port service drivers, then amend bwctrl.c to disable/enable
> interrupts in these callbacks.
Will it work? I mean if the port itself is not reset (0000:00:01.1 in this
case), do these callbacks get called for it?
> > + port->link_bwctrl->disable_count--;
> > + if (!port->link_bwctrl->disable_count) {
> > + __pcie_bwnotif_enable(port);
> > + pci_dbg(port, "BW notifications enabled\n");
> > + }
> > + WARN_ON_ONCE(port->link_bwctrl->disable_count < 0);
>
> So why do you need to count this? IIUC you get two consecutive
> disable and two consecutive enable events.
>
> If the interrupts are already disabled, just do nothing.
> Same for enablement. Any reason this simpler approach
> doesn't work?
If we enabled on restore of the first dev, BW notifications would get
enabled before the last of the devices has been restored. While it might
work (the test patch, after all, did work without this complexity), IMO
it seems unwise to create such a racy pattern.
For the disable side, I think it would be possible to always disable
unconditionally.
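To spell out the ordering with the counter for the 01:00.0/01:00.1 pair
behind port 00:01.1 (the same sequence as in the log in the commit
message):

  pci_dev_save_and_disable(01:00.0) -> pcie_bwnotif_disable(00:01.1)  /* 0 -> 1, LBMIE/LABIE cleared */
  pci_dev_save_and_disable(01:00.1) -> pcie_bwnotif_disable(00:01.1)  /* 1 -> 2, nothing to do */
      <bus reset>
  pci_dev_restore(01:00.0)          -> pcie_bwnotif_enable(00:01.1)   /* 2 -> 1, notifications stay off */
  pci_dev_restore(01:00.1)          -> pcie_bwnotif_enable(00:01.1)   /* 1 -> 0, notifications re-enabled */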
--
i.
* Re: [PATCH 1/1] PCI/bwctrl: Disable PCIe BW controller during reset
From: Lukas Wunner @ 2025-02-27 5:31 UTC (permalink / raw)
To: Ilpo Järvinen
Cc: Bjorn Helgaas, Jonathan Cameron, linux-pci, LKML,
Joel Mathew Thomas, stable
On Mon, Feb 24, 2025 at 05:13:15PM +0200, Ilpo Järvinen wrote:
> On Tue, 18 Feb 2025, Lukas Wunner wrote:
> > On Mon, Feb 17, 2025 at 06:52:58PM +0200, Ilpo Järvinen wrote:
> > > PCIe BW controller enables BW notifications for Downstream Ports by
> > > setting Link Bandwidth Management Interrupt Enable (LBMIE) and Link
> > > Autonomous Bandwidth Interrupt Enable (LABIE) (PCIe Spec. r6.2 sec.
> > > 7.5.3.7).
> > >
> > > It was discovered that performing a reset can lead to the device
> > > underneath the Downstream Port becoming unavailable if BW notifications
> > > are left enabled throughout the reset sequence (at least LBMIE was
> > > found to cause an issue).
> >
> > What kind of reset? FLR? SBR? This needs to be specified in the
> > commit message so that the reader isn't forced to sift through a
> > bugzilla with dozens of comments and attachments.
>
> Heh, I never really tried to figure out it because the reset disable
> patch was just a stab into the dark style patch. To my surprise, it ended
> up working (after the initial confusion was resolved) and I just started
> to prepare this patch from that knowledge.
If the present patch is of the type "changing this somehow makes the
problem go away" instead of a complete root-cause analysis, it would
have been appropriate to mark it as an RFC.
I've started to dig into the bugzilla and the very first attachment
(dmesg for the non-working case) shows:
vfio-pci 0000:01:00.0: timed out waiting for pending transaction; performing function level reset anyway
That message is emitted by pcie_flr(). Perhaps the Nvidia GPU takes
more time than usual to finish pending transactions, so the first
thing I would have tried would be to raise the timeout significantly
and see if that helps. Yet I'm not seeing any patch or comment in
the bugzilla where this was attempted. Please provide a patch for
the reporter to verify this hypothesis.
> Logs do mention this:
>
> [ 21.560206] pcieport 0000:00:01.1: unlocked secondary bus reset via: pciehp_reset_slot+0x98/0x140
>
> ...so it seems to be SBR.
Looking at the vfio code, vfio_pci_core_enable() (which is called on
binding the vfio driver to the GPU) invokes pci_try_reset_function().
This will execute the reset method configured via sysfs. The same
is done on unbind via vfio_pci_core_disable().
So you should have asked the reporter for the contents of:
/sys/bus/pci/devices/0000:01:00.0/reset_method
/sys/bus/pci/devices/0000:01:00.1/reset_method
In particular, I would like to know whether the contents differ across
different kernel versions.
There's another way to perform a reset: Via an ioctl. This ends up
calling vfio_pci_dev_set_hot_reset(), which invokes pci_reset_bus()
to perform an SBR.
Looking at dmesg output in log_linux_6.13.2-arch1-1_pcie_port_pm_off.log
it seems that vfio first performs a function reset of the GPU on bind...
[ 40.171564] vfio-pci 0000:01:00.0: resetting
[ 40.276485] vfio-pci 0000:01:00.0: reset done
...and then goes on to perform an SBR both of the GPU and its audio
device...
[ 40.381082] vfio-pci 0000:01:00.0: resetting
[ 40.381180] vfio-pci 0000:01:00.1: resetting
[ 40.381228] pcieport 0000:00:01.1: unlocked secondary bus reset via: pciehp_reset_slot+0x98/0x140
[ 40.620442] vfio-pci 0000:01:00.0: reset done
[ 40.620479] vfio-pci 0000:01:00.1: reset done
...which is odd because the audio device apparently wasn't bound to
vfio-pci, otherwise there would have been a function reset. So why
does vfio think it can safely reset it?
Oddly, there is a third function reset of only the GPU:
[ 40.621894] vfio-pci 0000:01:00.0: resetting
[ 40.724430] vfio-pci 0000:01:00.0: reset done
The reporter writes that pcie_port_pm=off avoids the PME messages.
If the reset_method is "pm", I could imagine that the Nvidia GPU
signals a PME event during the D0 -> D3hot -> D0 transition.
I also note that the vfio-pci driver allows runtime PM. So both the
GPU and its audio device may runtime suspend to D3hot. This in turn
lets the Root Port runtime suspend to D3hot. It looks like the
reporter is using a laptop with an integrated AMD GPU and a
discrete Nvidia GPU. On such products the platform often allows
powering down the discrete GPU and this is usually controlled
through ACPI Power Resources attached to the Root Port.
Those are powered off after the Root Port goes to D3hot.
You should have asked the reporter for an acpidump.
pcie_bwnotif_irq() accesses the Link Status register without
acquiring a runtime PM reference on the PCIe port. This feels
wrong and may also contribute to the issue reported here.
Acquiring a runtime PM ref may sleep, so I think you need to
change the driver to use a threaded IRQ handler.
Nvidia GPUs are known to hide the audio device if no audio-capable
display is attached (e.g. HDMI). quirk_nvidia_hda() unhides the
audio device on boot and resume. It might be necessary to also run
the quirk after resetting the GPU. Knowing which reset_method
was used is important to decide if that's necessary, and also
whether a display was attached.
Moreover Nvidia GPUs are known to change the link speed on idle
to reduce power consumption. Perhaps resetting the GPU causes
a change of link speed and thus execution of pcie_bwnotif_irq()?
> > This approach won't work if the reset is performed without software
> > intervention. E.g. if a DPC event occurs, the device likewise undergoes
> > a reset but there is no prior system software involvement. Software only
> > becomes involved *after* the reset has occurred.
> >
> > I think it needs to be tested if that same issue occurs with DPC.
> > It's easy to simulate DPC by setting the Software Trigger bit:
> >
> > setpci -s 00:01.1 ECAP_DPC+6.w=40:40
> >
> > If the issue does occur with DPC then this fix isn't sufficient.
>
> Looking into lspci logs, I don't see DPC capability being there for
> 00:01.1?!
Hm, so we can't verify whether your approach is safe for DPC.
> > Instead of putting this in the PCI core, amend pcie_portdrv_err_handler
> > with ->reset_prepare and ->reset_done callbacks which call down to all
> > the port service drivers, then amend bwctrl.c to disable/enable
> > interrupts in these callbacks.
>
> Will it work? I mean if the port itself is not reset (0000:00:01.1 in this
> case), do these callbacks get called for it?
Never mind, indeed this won't work.
Thanks,
Lukas
* Re: [PATCH 1/1] PCI/bwctrl: Disable PCIe BW controller during reset
From: Lukas Wunner @ 2025-02-28 8:37 UTC (permalink / raw)
To: Ilpo Järvinen
Cc: Bjorn Helgaas, Jonathan Cameron, linux-pci, LKML,
Joel Mathew Thomas, stable
On Thu, Feb 27, 2025 at 06:31:08AM +0100, Lukas Wunner wrote:
> pcie_bwnotif_irq() accesses the Link Status register without
> acquiring a runtime PM reference on the PCIe port. This feels
> wrong and may also contribute to the issue reported here.
> Acquiring a runtime PM ref may sleep, so I think you need to
> change the driver to use a threaded IRQ handler.
I've realized we've had a discussion before why a threaded IRQ handler
doesn't make sense...
https://lore.kernel.org/all/Z35qJ3H_8u5LQDJ6@wunner.de/
...but I'm still worried that a Downstream Port in a nested-switch
configuration may be runtime suspended while the hardirq handler
is running. Is there anything preventing that from happening?
To access config space of a port, it's sufficient if its upstream
bridge is runtime active (i.e. in PCI D0).
So basically the below is what I have in mind. This assumes that
the upstream bridge is still in D0 when the interrupt handler runs
because in atomic context we can't wait for it to be runtime resumed.
Seems like a fair assumption to me but what do I know...
-- >8 --
diff --git a/drivers/pci/pcie/bwctrl.c b/drivers/pci/pcie/bwctrl.c
index 0a5e7efbce2c..fea8f7412266 100644
--- a/drivers/pci/pcie/bwctrl.c
+++ b/drivers/pci/pcie/bwctrl.c
@@ -28,6 +28,7 @@
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/pci-bwctrl.h>
+#include <linux/pm_runtime.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -235,9 +236,13 @@ static irqreturn_t pcie_bwnotif_irq(int irq, void *context)
struct pcie_device *srv = context;
struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
struct pci_dev *port = srv->port;
+ struct device *parent __free(pm_runtime_put) = port->dev.parent;
u16 link_status, events;
int ret;
+ if (parent)
+ pm_runtime_get_noresume(parent);
+
ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
if (ret != PCIBIOS_SUCCESSFUL)
return IRQ_NONE;
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index d39dc863f612..038228de773d 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -448,6 +448,8 @@ static inline int pm_runtime_put(struct device *dev)
return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC);
}
+DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T))
+
/**
* __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0.
* @dev: Target device.
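(In case the cleanup attribute is unfamiliar: with the DEFINE_FREE() hook
above, the __free(pm_runtime_put) annotation drops the reference on every
return path once the variable goes out of scope, i.e. it is roughly
equivalent to open-coding:)

	if (parent)
		pm_runtime_get_noresume(parent);

	ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
	if (ret != PCIBIOS_SUCCESSFUL) {
		if (parent)
			pm_runtime_put(parent);
		return IRQ_NONE;
	}
	/* ... and likewise before every other return in the handler. */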
* Re: [PATCH 1/1] PCI/bwctrl: Disable PCIe BW controller during reset
From: Ilpo Järvinen @ 2025-03-03 10:44 UTC (permalink / raw)
To: Lukas Wunner
Cc: Bjorn Helgaas, Jonathan Cameron, linux-pci, LKML,
Joel Mathew Thomas, stable
On Fri, 28 Feb 2025, Lukas Wunner wrote:
> On Thu, Feb 27, 2025 at 06:31:08AM +0100, Lukas Wunner wrote:
> > pcie_bwnotif_irq() accesses the Link Status register without
> > acquiring a runtime PM reference on the PCIe port. This feels
> > wrong and may also contribute to the issue reported here.
> > Acquiring a runtime PM ref may sleep, so I think you need to
> > change the driver to use a threaded IRQ handler.
>
> I've realized we've had a discussion before why a threaded IRQ handler
> doesn't make sense...
Yes.
> https://lore.kernel.org/all/Z35qJ3H_8u5LQDJ6@wunner.de/
>
> ...but I'm still worried that a Downstream Port in a nested-switch
> configuration may be runtime suspended while the hardirq handler
> is running. Is there anything preventing that from happening?
I don't think there is.
> To access config space of a port, it's sufficient if its upstream
> bridge is runtime active (i.e. in PCI D0).
>
> So basically the below is what I have in mind. This assumes that
> the upstream bridge is still in D0 when the interrupt handler runs
> because in atomic context we can't wait for it to be runtime resumed.
> Seems like a fair assumption to me but what do I know...
bwctrl doesn't even want to resume the port in the irq handler. If the port
is suspended, why would it have LBMS/LABS set? We disable notifications
in the suspend handler anyway, so we're not even expecting them to come
during a period of suspend (which does not mean there couldn't be
interrupts due to other sources).
So there should be no problem in not calling resume for it.
> -- >8 --
>
> diff --git a/drivers/pci/pcie/bwctrl.c b/drivers/pci/pcie/bwctrl.c
> index 0a5e7efbce2c..fea8f7412266 100644
> --- a/drivers/pci/pcie/bwctrl.c
> +++ b/drivers/pci/pcie/bwctrl.c
> @@ -28,6 +28,7 @@
> #include <linux/mutex.h>
> #include <linux/pci.h>
> #include <linux/pci-bwctrl.h>
> +#include <linux/pm_runtime.h>
> #include <linux/rwsem.h>
> #include <linux/slab.h>
> #include <linux/types.h>
> @@ -235,9 +236,13 @@ static irqreturn_t pcie_bwnotif_irq(int irq, void *context)
> struct pcie_device *srv = context;
> struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
> struct pci_dev *port = srv->port;
> + struct device *parent __free(pm_runtime_put) = port->dev.parent;
> u16 link_status, events;
> int ret;
>
> + if (parent)
> + pm_runtime_get_noresume(parent);
> +
Should this then check whether the port is suspended and return early if
it is?
pm_runtime_suspended() has some caveats in its kerneldoc, though, so I'm a
bit unsure whether it can be called safely here; probably not.
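I.e. something like this (untested sketch; it assumes pm_runtime_suspended()
can be read safely in this context, which is exactly the part I'm unsure
about):

	if (parent)
		pm_runtime_get_noresume(parent);

	/*
	 * The suspend handler already disabled BW notifications, so if the
	 * port is suspended this interrupt is for some other source sharing
	 * the vector; the __free(pm_runtime_put) above drops the reference
	 * on return.
	 */
	if (pm_runtime_suspended(&port->dev))
		return IRQ_NONE;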
> ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
> if (ret != PCIBIOS_SUCCESSFUL)
> return IRQ_NONE;
> diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
> index d39dc863f612..038228de773d 100644
> --- a/include/linux/pm_runtime.h
> +++ b/include/linux/pm_runtime.h
> @@ -448,6 +448,8 @@ static inline int pm_runtime_put(struct device *dev)
> return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC);
> }
>
> +DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T))
> +
> /**
> * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0.
> * @dev: Target device.
>
--
i.
* Re: [PATCH 1/1] PCI/bwctrl: Disable PCIe BW controller during reset
From: Ilpo Järvinen @ 2025-03-03 11:58 UTC (permalink / raw)
To: Lukas Wunner
Cc: Bjorn Helgaas, Jonathan Cameron, linux-pci, LKML,
Joel Mathew Thomas, stable
On Thu, 27 Feb 2025, Lukas Wunner wrote:
> On Mon, Feb 24, 2025 at 05:13:15PM +0200, Ilpo Järvinen wrote:
> > On Tue, 18 Feb 2025, Lukas Wunner wrote:
> > > On Mon, Feb 17, 2025 at 06:52:58PM +0200, Ilpo Järvinen wrote:
> > > > PCIe BW controller enables BW notifications for Downstream Ports by
> > > > setting Link Bandwidth Management Interrupt Enable (LBMIE) and Link
> > > > Autonomous Bandwidth Interrupt Enable (LABIE) (PCIe Spec. r6.2 sec.
> > > > 7.5.3.7).
> > > >
> > > > It was discovered that performing a reset can lead to the device
> > > > underneath the Downstream Port becoming unavailable if BW notifications
> > > > are left enabled throughout the reset sequence (at least LBMIE was
> > > > found to cause an issue).
> > >
> > > What kind of reset? FLR? SBR? This needs to be specified in the
> > > commit message so that the reader isn't forced to sift through a
> > > bugzilla with dozens of comments and attachments.
> >
> > Heh, I never really tried to figure out it because the reset disable
> > patch was just a stab into the dark style patch. To my surprise, it ended
> > up working (after the initial confusion was resolved) and I just started
> > to prepare this patch from that knowledge.
>
> If the present patch is of the type "changing this somehow makes the
> problem go away" instead of a complete root-cause analysis, it would
> have been appropriate to mark it as an RFC.
I'll keep that in mind in the future.
I don't think your depiction of the situation is entirely accurate,
though. The reporter had confirmed that if the bwctrl change(s) are
reverted, the problem is not observed.
So I set out to understand why bwctrl has any impact here at all, since it
should touch only a different device (and even that in relatively limited
ways). And that is how I found that if bwctrl is not enabled during the
reset, the problem is also not observed.
I also noted that the patch just works around the problems, and there was
also informal speculation about the suspected root cause in the patch (the
only other theory I have about the root cause relates to extra interrupts
causing a problem through the hp/pme interrupt handlers).
> I've started to dig into the bugzilla and the very first attachment
> (dmesg for the non-working case) shows:
>
> vfio-pci 0000:01:00.0: timed out waiting for pending transaction; performing function level reset anyway
>
> That message is emitted by pcie_flr(). Perhaps the Nvidia GPU takes
> more time than usual to finish pending transactions, so the first
> thing I would have tried would be to raise the timeout significantly
> and see if that helps. Yet I'm not seeing any patch or comment in
> the bugzilla where this was attempted. Please provide a patch for
> the reporter to verify this hypothesis.
I have a problem understanding how reverting the bwctrl change "solves"
this. Bwctrl is not even supposed to touch the Nvidia GPU at all, AFAIK.
> > Logs do mention this:
> >
> > [ 21.560206] pcieport 0000:00:01.1: unlocked secondary bus reset via: pciehp_reset_slot+0x98/0x140
> >
> > ...so it seems to be SBR.
>
> Looking at the vfio code, vfio_pci_core_enable() (which is called on
> binding the vfio driver to the GPU) invokes pci_try_reset_function().
> This will execute the reset method configured via sysfs. The same
> is done on unbind via vfio_pci_core_disable().
>
> So you should have asked the reporter for the contents of:
> /sys/bus/pci/devices/0000:01:00.0/reset_method
> /sys/bus/pci/devices/0000:01:00.1/reset_method
>
> In particular, I would like to know whether the contents differ across
> different kernel versions.
>
> There's another way to perform a reset: Via an ioctl. This ends up
> calling vfio_pci_dev_set_hot_reset(), which invokes pci_reset_bus()
> to perform an SBR.
>
> Looking at dmesg output in log_linux_6.13.2-arch1-1_pcie_port_pm_off.log
> it seems that vfio first performs a function reset of the GPU on bind...
>
> [ 40.171564] vfio-pci 0000:01:00.0: resetting
> [ 40.276485] vfio-pci 0000:01:00.0: reset done
>
> ...and then goes on to perform an SBR both of the GPU and its audio
> device...
>
> [ 40.381082] vfio-pci 0000:01:00.0: resetting
> [ 40.381180] vfio-pci 0000:01:00.1: resetting
> [ 40.381228] pcieport 0000:00:01.1: unlocked secondary bus reset via: pciehp_reset_slot+0x98/0x140
> [ 40.620442] vfio-pci 0000:01:00.0: reset done
> [ 40.620479] vfio-pci 0000:01:00.1: reset done
>
> ...which is odd because the audio device apparently wasn't bound to
> vfio-pci, otherwise there would have been a function reset. So why
> does vfio think it can safely reset it?
>
> Oddly, there is a third function reset of only the GPU:
>
> [ 40.621894] vfio-pci 0000:01:00.0: resetting
> [ 40.724430] vfio-pci 0000:01:00.0: reset done
>
> The reporter writes that pcie_port_pm=off avoids the PME messages.
> If the reset_method is "pm", I could imagine that the Nvidia GPU
> signals a PME event during the D0 -> D3hot -> D0 transition.
>
> I also note that the vfio-pci driver allows runtime PM. So both the
> GPU and its audio device may runtime suspend to D3hot. This in turn
> lets the Root Port runtime suspend to D3hot. It looks like the
> reporter is using a laptop with an integrated AMD GPU and a
> discrete Nvidia GPU. On such products the platform often allows
> powering down the discrete GPU and this is usually controlled
> through ACPI Power Resources attached to the Root Port.
> Those are powered off after the Root Port goes to D3hot.
> You should have asked the reporter for an acpidump.
A lot of these suggestions do not make much sense to me, given that
reverting bwctrl alone makes the problem go away. E.g., the reset
method should be exactly the same.
I can see there could be some path through either the hp or PME interrupt
handlers, where an extra interrupt that comes due to bwctrl (LBMIE) being
enabled triggers one of those behaviors. But none of the above
description theorizes anything in that direction.
> pcie_bwnotif_irq() accesses the Link Status register without
> acquiring a runtime PM reference on the PCIe port. This feels
> wrong and may also contribute to the issue reported here.
> Acquiring a runtime PM ref may sleep, so I think you need to
> change the driver to use a threaded IRQ handler.
>
> Nvidia GPUs are known to hide the audio device if no audio-capable
> display is attached (e.g. HDMI). quirk_nvidia_hda() unhides the
> audio device on boot and resume. It might be necessary to also run
> the quirk after resetting the GPU. Knowing which reset_method
> was used is important to decide if that's necessary, and also
> whether a display was attached.
...You seem to have a lot of ideas on this. ;-)
> Moreover Nvidia GPUs are known to change the link speed on idle
> to reduce power consumption. Perhaps resetting the GPU causes
> a change of link speed and thus execution of pcie_bwnotif_irq()?
Why would execution of pcie_bwnotif_irq() be a problem? It's not that
complicated.
I'm more worried that having LBMIE enabled also causes the other interrupt
handlers to execute, due to the shared interrupt.
> > > This approach won't work if the reset is performed without software
> > > intervention. E.g. if a DPC event occurs, the device likewise undergoes
> > > a reset but there is no prior system software involvement. Software only
> > > becomes involved *after* the reset has occurred.
> > >
> > > I think it needs to be tested if that same issue occurs with DPC.
> > > It's easy to simulate DPC by setting the Software Trigger bit:
> > >
> > > setpci -s 00:01.1 ECAP_DPC+6.w=40:40
> > >
> > > If the issue does occur with DPC then this fix isn't sufficient.
> >
> > Looking into lspci logs, I don't see DPC capability being there for
> > 00:01.1?!
>
> Hm, so we can't verify whether your approach is safe for DPC.
>
>
> > > Instead of putting this in the PCI core, amend pcie_portdrv_err_handler
> > > with ->reset_prepare and ->reset_done callbacks which call down to all
> > > the port service drivers, then amend bwctrl.c to disable/enable
> > > interrupts in these callbacks.
> >
> > Will it work? I mean if the port itself is not reset (0000:00:01.1 in this
> > case), do these callbacks get called for it?
>
> Never mind, indeed this won't work.
>
> Thanks,
>
> Lukas
>
I'm sorry for the delay, I've been sick for a while.
--
i.
* Re: [PATCH 1/1] PCI/bwctrl: Disable PCIe BW controller during reset
From: Ilpo Järvinen @ 2025-03-13 14:26 UTC (permalink / raw)
To: Lukas Wunner, Bjorn Helgaas
Cc: Jonathan Cameron, linux-pci, LKML, Joel Mathew Thomas, stable
On Mon, 17 Feb 2025, Ilpo Järvinen wrote:
> PCIe BW controller enables BW notifications for Downstream Ports by
> setting Link Bandwidth Management Interrupt Enable (LBMIE) and Link
> Autonomous Bandwidth Interrupt Enable (LABIE) (PCIe Spec. r6.2 sec.
> 7.5.3.7).
>
> It was discovered that performing a reset can lead to the device
> underneath the Downstream Port becoming unavailable if BW notifications
> are left enabled throughout the reset sequence (at least LBMIE was
> found to cause an issue).
>
> While the PCIe Specifications do not indicate BW notifications could not
> be kept enabled during resets, the PCIe Link state during an
> intentional reset is not of large interest. Thus, disable BW controller
> for the bridge while reset is performed and re-enable it after the
> reset has completed to workaround the problems some devices encounter
> if BW notifications are left on throughout the reset sequence.
>
> Keep a counter for the disable/enable because MFD will execute
> pci_dev_save_and_disable() and pci_dev_restore() back to back for
> sibling devices:
>
> [ 50.139010] vfio-pci 0000:01:00.0: resetting
> [ 50.139053] vfio-pci 0000:01:00.1: resetting
> [ 50.141126] pcieport 0000:00:01.1: PME: Spurious native interrupt!
> [ 50.141133] pcieport 0000:00:01.1: PME: Spurious native interrupt!
> [ 50.441466] vfio-pci 0000:01:00.0: reset done
> [ 50.501534] vfio-pci 0000:01:00.1: reset done
>
> Fixes: 665745f27487 ("PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller")
> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219765
> Tested-by: Joel Mathew Thomas <proxy0@tutamail.com>
> Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
> Cc: stable@vger.kernel.org
> ---
>
> I suspect the root cause is some kind of violation of specifications.
> Resets shouldn't cause devices to become unavailable just because BW
> notifications have been enabled.
>
> Before somebody comments on those dual rwsems, I do have yet to be
> submitted patch to simplify the locking as per Lukas Wunner's earlier
> suggestion. I've just focused on solving the regressions first.
Hi all,
This problem was root-caused to a problem in the hotplug code, so this
patch should not be applied. I've just sent a series to address the
synchronization problems in the hotplug code, and it should be considered
instead to fix the issues this patch attempted to work around.
--
i.
> drivers/pci/pci.c | 8 +++++++
> drivers/pci/pci.h | 4 ++++
> drivers/pci/pcie/bwctrl.c | 49 ++++++++++++++++++++++++++++++++-------
> 3 files changed, 53 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 869d204a70a3..7a53d7474175 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -5148,6 +5148,7 @@ static void pci_dev_save_and_disable(struct pci_dev *dev)
> {
> const struct pci_error_handlers *err_handler =
> dev->driver ? dev->driver->err_handler : NULL;
> + struct pci_dev *bridge = pci_upstream_bridge(dev);
>
> /*
> * dev->driver->err_handler->reset_prepare() is protected against
> @@ -5166,6 +5167,9 @@ static void pci_dev_save_and_disable(struct pci_dev *dev)
> */
> pci_set_power_state(dev, PCI_D0);
>
> + if (bridge)
> + pcie_bwnotif_disable(bridge);
> +
> pci_save_state(dev);
> /*
> * Disable the device by clearing the Command register, except for
> @@ -5181,9 +5185,13 @@ static void pci_dev_restore(struct pci_dev *dev)
> {
> const struct pci_error_handlers *err_handler =
> dev->driver ? dev->driver->err_handler : NULL;
> + struct pci_dev *bridge = pci_upstream_bridge(dev);
>
> pci_restore_state(dev);
>
> + if (bridge)
> + pcie_bwnotif_enable(bridge);
> +
> /*
> * dev->driver->err_handler->reset_done() is protected against
> * races with ->remove() by the device lock, which must be held by
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 01e51db8d285..856546f1aad9 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -759,12 +759,16 @@ static inline void pcie_ecrc_get_policy(char *str) { }
> #ifdef CONFIG_PCIEPORTBUS
> void pcie_reset_lbms_count(struct pci_dev *port);
> int pcie_lbms_count(struct pci_dev *port, unsigned long *val);
> +void pcie_bwnotif_enable(struct pci_dev *port);
> +void pcie_bwnotif_disable(struct pci_dev *port);
> #else
> static inline void pcie_reset_lbms_count(struct pci_dev *port) {}
> static inline int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
> {
> return -EOPNOTSUPP;
> }
> +static inline void pcie_bwnotif_enable(struct pci_dev *port) {}
> +static inline void pcie_bwnotif_disable(struct pci_dev *port) {}
> #endif
>
> struct pci_dev_reset_methods {
> diff --git a/drivers/pci/pcie/bwctrl.c b/drivers/pci/pcie/bwctrl.c
> index 0a5e7efbce2c..a117f6f67c07 100644
> --- a/drivers/pci/pcie/bwctrl.c
> +++ b/drivers/pci/pcie/bwctrl.c
> @@ -40,11 +40,13 @@
> * @set_speed_mutex: Serializes link speed changes
> * @lbms_count: Count for LBMS (since last reset)
> * @cdev: Thermal cooling device associated with the port
> + * @disable_count: BW notifications disabled/enabled counter
> */
> struct pcie_bwctrl_data {
> struct mutex set_speed_mutex;
> atomic_t lbms_count;
> struct thermal_cooling_device *cdev;
> + int disable_count;
> };
>
> /*
> @@ -200,10 +202,9 @@ int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
> return ret;
> }
>
> -static void pcie_bwnotif_enable(struct pcie_device *srv)
> +static void __pcie_bwnotif_enable(struct pci_dev *port)
> {
> - struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
> - struct pci_dev *port = srv->port;
> + struct pcie_bwctrl_data *data = port->link_bwctrl;
> u16 link_status;
> int ret;
>
> @@ -224,12 +225,44 @@ static void pcie_bwnotif_enable(struct pcie_device *srv)
> pcie_update_link_speed(port->subordinate);
> }
>
> -static void pcie_bwnotif_disable(struct pci_dev *port)
> +void pcie_bwnotif_enable(struct pci_dev *port)
> +{
> + guard(rwsem_read)(&pcie_bwctrl_setspeed_rwsem);
> + guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
> +
> + if (!port->link_bwctrl)
> + return;
> +
> + port->link_bwctrl->disable_count--;
> + if (!port->link_bwctrl->disable_count) {
> + __pcie_bwnotif_enable(port);
> + pci_dbg(port, "BW notifications enabled\n");
> + }
> + WARN_ON_ONCE(port->link_bwctrl->disable_count < 0);
> +}
> +
> +static void __pcie_bwnotif_disable(struct pci_dev *port)
> {
> pcie_capability_clear_word(port, PCI_EXP_LNKCTL,
> PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
> }
>
> +void pcie_bwnotif_disable(struct pci_dev *port)
> +{
> + guard(rwsem_read)(&pcie_bwctrl_setspeed_rwsem);
> + guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
> +
> + if (!port->link_bwctrl)
> + return;
> +
> + port->link_bwctrl->disable_count++;
> +
> + if (port->link_bwctrl->disable_count == 1) {
> + __pcie_bwnotif_disable(port);
> + pci_dbg(port, "BW notifications disabled\n");
> + }
> +}
> +
> static irqreturn_t pcie_bwnotif_irq(int irq, void *context)
> {
> struct pcie_device *srv = context;
> @@ -314,7 +347,7 @@ static int pcie_bwnotif_probe(struct pcie_device *srv)
> return ret;
> }
>
> - pcie_bwnotif_enable(srv);
> + __pcie_bwnotif_enable(port);
> }
> }
>
> @@ -336,7 +369,7 @@ static void pcie_bwnotif_remove(struct pcie_device *srv)
>
> scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) {
> scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
> - pcie_bwnotif_disable(srv->port);
> + __pcie_bwnotif_disable(srv->port);
>
> free_irq(srv->irq, srv);
>
> @@ -347,13 +380,13 @@ static void pcie_bwnotif_remove(struct pcie_device *srv)
>
> static int pcie_bwnotif_suspend(struct pcie_device *srv)
> {
> - pcie_bwnotif_disable(srv->port);
> + __pcie_bwnotif_disable(srv->port);
> return 0;
> }
>
> static int pcie_bwnotif_resume(struct pcie_device *srv)
> {
> - pcie_bwnotif_enable(srv);
> + __pcie_bwnotif_enable(srv->port);
> return 0;
> }
>
>
> base-commit: 2014c95afecee3e76ca4a56956a936e23283f05b
>