[REGRESSION 02/04] Crash during resume of pcie bridge

linux-pci.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Bert Karwatzki <spasswolf@web.de>
To: linux-kernel@vger.kernel.org
Cc: "Bert Karwatzki" <spasswolf@web.de>,
	linux-next@vger.kernel.org, linux-stable@vger.kernel.org,
	regressions@lists.linux.dev, linux-pci@vger.kernel.org,
	linux-acpi@vger.kernel.org,
	"Mario Limonciello" <superm1@kernel.org>,
	"Christian König" <christian.koenig@amd.com>,
	"Rafael J . Wysocki" <rafael.j.wysocki@intel.com>
Subject: [REGRESSION 02/04] Crash during resume of pcie bridge
Date: Mon,  6 Oct 2025 14:09:41 +0200	[thread overview]
Message-ID: <20251006120944.7880-3-spasswolf@web.de> (raw)
In-Reply-To: <20251006120944.7880-1-spasswolf@web.de>

The next step is to monitor pm_runtime_get_sync(), rpm_resume() and __pm_runtime_resume().
Here we need to use conditional debugging output or else we get messages at a
rate of about a million lines per minute.

These is the additional debugging used in
6.17.0-rc6-next-20250917-gpudebug-00024-g5c6b49b810db

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 7420b9851fe0..895898c3cd56 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -787,12 +787,18 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	struct device *parent = NULL;
 	int retval = 0;
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	trace_rpm_resume(dev, rpmflags);
 
  repeat:
 	if (dev->power.runtime_error) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		retval = -EINVAL;
 	} else if (dev->power.disable_depth > 0) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		if (dev->power.runtime_status == RPM_ACTIVE &&
 		    dev->power.last_status == RPM_ACTIVE)
 			retval = 1;
@@ -808,31 +814,45 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	 * rather than cancelling it now only to restart it again in the near
 	 * future.
 	 */
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	dev->power.request = RPM_REQ_NONE;
 	if (!dev->power.timer_autosuspends)
 		pm_runtime_deactivate_timer(dev);
 
 	if (dev->power.runtime_status == RPM_ACTIVE) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		retval = 1;
 		goto out;
 	}
 
 	if (dev->power.runtime_status == RPM_RESUMING ||
 	    dev->power.runtime_status == RPM_SUSPENDING) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		DEFINE_WAIT(wait);
 
 		if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			if (dev->power.runtime_status == RPM_SUSPENDING) {
+				if (!strcmp(dev_name(dev), "0000:00:01.1"))
+					dev_info(dev, "%s %d\n", __func__, __LINE__);
 				dev->power.deferred_resume = true;
 				if (rpmflags & RPM_NOWAIT)
 					retval = -EINPROGRESS;
 			} else {
+				if (!strcmp(dev_name(dev), "0000:00:01.1"))
+					dev_info(dev, "%s %d\n", __func__, __LINE__);
 				retval = -EINPROGRESS;
 			}
 			goto out;
 		}
 
 		if (dev->power.irq_safe) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			spin_unlock(&dev->power.lock);
 
 			cpu_relax();
@@ -856,6 +876,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 			spin_lock_irq(&dev->power.lock);
 		}
 		finish_wait(&dev->power.wait_queue, &wait);
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		goto repeat;
 	}
 
@@ -865,22 +887,32 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	 * the resume will actually succeed.
 	 */
 	if (dev->power.no_callbacks && !parent && dev->parent) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING);
 		if (dev->parent->power.disable_depth > 0 ||
 		    dev->parent->power.ignore_children ||
 		    dev->parent->power.runtime_status == RPM_ACTIVE) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			atomic_inc(&dev->parent->power.child_count);
 			spin_unlock(&dev->parent->power.lock);
 			retval = 1;
 			goto no_callback;	/* Assume success. */
 		}
 		spin_unlock(&dev->parent->power.lock);
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 	}
 
 	/* Carry out an asynchronous or a synchronous resume. */
 	if (rpmflags & RPM_ASYNC) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		dev->power.request = RPM_REQ_RESUME;
 		if (!dev->power.request_pending) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			dev->power.request_pending = true;
 			queue_work(pm_wq, &dev->power.work);
 		}
@@ -894,6 +926,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 		 * necessary.  Not needed if dev is irq-safe; then the
 		 * parent is permanently resumed.
 		 */
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		parent = dev->parent;
 		if (dev->power.irq_safe)
 			goto skip_parent;
@@ -909,6 +943,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 		 */
 		if (!parent->power.disable_depth &&
 		    !parent->power.ignore_children) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			rpm_resume(parent, 0);
 			if (parent->power.runtime_status != RPM_ACTIVE)
 				retval = -EBUSY;
@@ -919,10 +955,14 @@ static int rpm_resume(struct device *dev, int rpmflags)
 		if (retval)
 			goto out;
 
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		goto repeat;
 	}
  skip_parent:
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	if (dev->power.no_callbacks)
 		goto no_callback;	/* Assume success. */
 
@@ -933,11 +973,15 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	dev_pm_disable_wake_irq_check(dev, false);
 	retval = rpm_callback(callback, dev);
 	if (retval) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		__update_runtime_status(dev, RPM_SUSPENDED);
 		pm_runtime_cancel_pending(dev);
 		dev_pm_enable_wake_irq_check(dev, false);
 	} else {
  no_callback:
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		__update_runtime_status(dev, RPM_ACTIVE);
 		pm_runtime_mark_last_busy(dev);
 		if (parent)
@@ -949,7 +993,11 @@ static int rpm_resume(struct device *dev, int rpmflags)
 		rpm_idle(dev, RPM_ASYNC);
 
  out:
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	if (parent && !dev->power.irq_safe) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		spin_unlock_irq(&dev->power.lock);
 
 		pm_runtime_put(parent);
@@ -959,6 +1007,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 
 	trace_rpm_return_int(dev, _THIS_IP_, retval);
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	return retval;
 }
 
@@ -1181,17 +1231,27 @@ int __pm_runtime_resume(struct device *dev, int rpmflags)
 {
 	unsigned long flags;
 	int retval;
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 
 	might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe &&
 			dev->power.runtime_status != RPM_ACTIVE);
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	if (rpmflags & RPM_GET_PUT)
 		atomic_inc(&dev->power.usage_count);
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	spin_lock_irqsave(&dev->power.lock, flags);
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	retval = rpm_resume(dev, rpmflags);
 	spin_unlock_irqrestore(&dev->power.lock, flags);
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	return retval;
 }
 EXPORT_SYMBOL_GPL(__pm_runtime_resume);
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index d88d6b6ccf5b..0888b0d5ec73 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -508,6 +508,8 @@ static inline int pm_runtime_get(struct device *dev)
  */
 static inline int pm_runtime_get_sync(struct device *dev)
 {
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s\n", __func__);
 	return __pm_runtime_resume(dev, RPM_GET_PUT);
 }
 
With this there is no crash after 60h uptime with ~3093 GPP0 notifies, probably the
printk()s are mitigating the crash in some way (i.e. there's a race and the printk()s
are slowing down only one side ...).

It would be nice if we could get a crash while all the printk()s are in place,
but I'm not sure if we can ...

Stopped 6.17.0-rc6-next-20250917-gpudebug-00024-g5c6b49b810db after 60h and 3093 GPP0 notifies without crash.

Bert Karwatzki

next prev parent reply	other threads:[~2025-10-06 12:09 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-06 12:09 [REGRESSION 00/04] Crash during resume of pcie bridge Bert Karwatzki
2025-10-06 12:09 ` [REGRESSION 01/04] " Bert Karwatzki
2025-10-06 12:09 ` Bert Karwatzki [this message]
2025-10-06 12:09 ` [REGRESSION 03/04] " Bert Karwatzki
2025-10-06 12:09 ` [REGRESSION 04/04] " Bert Karwatzki
2025-10-06 12:39 ` [REGRESSION 00/04] " Christian König
2025-10-06 16:22   ` Bert Karwatzki
2025-10-07  6:50     ` Bert Karwatzki
2025-10-07 21:33 ` Mario Limonciello
2025-10-13 16:29   ` Bert Karwatzki
2025-10-13 18:51     ` Mario Limonciello
2025-10-14 10:50       ` Christian König
     [not found]         ` <1853e2af7f70cf726df278137b6d2d89d9d9dc82.camel@web.de>
2025-10-31 13:38           ` Bert Karwatzki
2025-10-31 13:47             ` Bert Karwatzki
2025-10-31 18:35               ` Bert Karwatzki
2025-11-05 11:44                 ` Bert Karwatzki
2025-11-05 21:31                   ` Mario Limonciello (AMD) (kernel.org)
2025-11-07 13:09                     ` Bert Karwatzki
2025-11-07 17:09                       ` Bert Karwatzki
2025-11-10 13:33                         ` Christian König
2025-11-16 21:08                           ` Crash during resume of pcie bridge due to infinite loop in ACPICA Bert Karwatzki
2025-11-17 16:40                             ` Rafael J. Wysocki
2025-11-24 22:34                               ` Bert Karwatzki
2025-11-25 19:46                                 ` Rafael J. Wysocki
2025-11-27  0:08                                   ` Bert Karwatzki
2025-11-27 13:02                                     ` Rafael J. Wysocki
2025-11-28 20:47                                       ` Bert Karwatzki
2025-12-02 18:59                                         ` Rafael J. Wysocki
2025-12-02 19:53                                           ` Bert Karwatzki
2025-12-02 20:01                                             ` Rafael J. Wysocki
2025-12-05 10:05                                               ` Crash during resume of pcie bridge due to incorrect error handling Bert Karwatzki

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:7420b9851fe dfblob:895898c3cd5 dfblob:d88d6b6ccf5
dfblob:0888b0d5ec7 )
 OR (
bs:"[REGRESSION 02/04] Crash during resume of pcie bridge" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251006120944.7880-3-spasswolf@web.de \
    --to=spasswolf@web.de \
    --cc=christian.koenig@amd.com \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-next@vger.kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=linux-stable@vger.kernel.org \
    --cc=rafael.j.wysocki@intel.com \
    --cc=regressions@lists.linux.dev \
    --cc=superm1@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).