All of lore.kernel.org
 help / color / mirror / Atom feed
From: Bert Karwatzki <spasswolf@web.de>
To: linux-kernel@vger.kernel.org
Cc: "Bert Karwatzki" <spasswolf@web.de>,
	linux-next@vger.kernel.org, linux-stable@vger.kernel.org,
	regressions@lists.linux.dev, linux-pci@vger.kernel.org,
	linux-acpi@vger.kernel.org,
	"Mario Limonciello" <superm1@kernel.org>,
	"Christian König" <christian.koenig@amd.com>,
	"Rafael J . Wysocki" <rafael.j.wysocki@intel.com>
Subject: [REGRESSION 02/04] Crash during resume of pcie bridge
Date: Mon,  6 Oct 2025 14:09:41 +0200	[thread overview]
Message-ID: <20251006120944.7880-3-spasswolf@web.de> (raw)
In-Reply-To: <20251006120944.7880-1-spasswolf@web.de>

The next step is to monitor pm_runtime_get_sync(), rpm_resume() and __pm_runtime_resume().
Here we need to use conditional debugging output or else we get messages at a
rate of about a million lines per minute.

These is the additional debugging used in
6.17.0-rc6-next-20250917-gpudebug-00024-g5c6b49b810db

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 7420b9851fe0..895898c3cd56 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -787,12 +787,18 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	struct device *parent = NULL;
 	int retval = 0;
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	trace_rpm_resume(dev, rpmflags);
 
  repeat:
 	if (dev->power.runtime_error) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		retval = -EINVAL;
 	} else if (dev->power.disable_depth > 0) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		if (dev->power.runtime_status == RPM_ACTIVE &&
 		    dev->power.last_status == RPM_ACTIVE)
 			retval = 1;
@@ -808,31 +814,45 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	 * rather than cancelling it now only to restart it again in the near
 	 * future.
 	 */
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	dev->power.request = RPM_REQ_NONE;
 	if (!dev->power.timer_autosuspends)
 		pm_runtime_deactivate_timer(dev);
 
 	if (dev->power.runtime_status == RPM_ACTIVE) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		retval = 1;
 		goto out;
 	}
 
 	if (dev->power.runtime_status == RPM_RESUMING ||
 	    dev->power.runtime_status == RPM_SUSPENDING) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		DEFINE_WAIT(wait);
 
 		if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			if (dev->power.runtime_status == RPM_SUSPENDING) {
+				if (!strcmp(dev_name(dev), "0000:00:01.1"))
+					dev_info(dev, "%s %d\n", __func__, __LINE__);
 				dev->power.deferred_resume = true;
 				if (rpmflags & RPM_NOWAIT)
 					retval = -EINPROGRESS;
 			} else {
+				if (!strcmp(dev_name(dev), "0000:00:01.1"))
+					dev_info(dev, "%s %d\n", __func__, __LINE__);
 				retval = -EINPROGRESS;
 			}
 			goto out;
 		}
 
 		if (dev->power.irq_safe) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			spin_unlock(&dev->power.lock);
 
 			cpu_relax();
@@ -856,6 +876,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 			spin_lock_irq(&dev->power.lock);
 		}
 		finish_wait(&dev->power.wait_queue, &wait);
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		goto repeat;
 	}
 
@@ -865,22 +887,32 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	 * the resume will actually succeed.
 	 */
 	if (dev->power.no_callbacks && !parent && dev->parent) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING);
 		if (dev->parent->power.disable_depth > 0 ||
 		    dev->parent->power.ignore_children ||
 		    dev->parent->power.runtime_status == RPM_ACTIVE) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			atomic_inc(&dev->parent->power.child_count);
 			spin_unlock(&dev->parent->power.lock);
 			retval = 1;
 			goto no_callback;	/* Assume success. */
 		}
 		spin_unlock(&dev->parent->power.lock);
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 	}
 
 	/* Carry out an asynchronous or a synchronous resume. */
 	if (rpmflags & RPM_ASYNC) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		dev->power.request = RPM_REQ_RESUME;
 		if (!dev->power.request_pending) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			dev->power.request_pending = true;
 			queue_work(pm_wq, &dev->power.work);
 		}
@@ -894,6 +926,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 		 * necessary.  Not needed if dev is irq-safe; then the
 		 * parent is permanently resumed.
 		 */
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		parent = dev->parent;
 		if (dev->power.irq_safe)
 			goto skip_parent;
@@ -909,6 +943,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 		 */
 		if (!parent->power.disable_depth &&
 		    !parent->power.ignore_children) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			rpm_resume(parent, 0);
 			if (parent->power.runtime_status != RPM_ACTIVE)
 				retval = -EBUSY;
@@ -919,10 +955,14 @@ static int rpm_resume(struct device *dev, int rpmflags)
 		if (retval)
 			goto out;
 
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		goto repeat;
 	}
  skip_parent:
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	if (dev->power.no_callbacks)
 		goto no_callback;	/* Assume success. */
 
@@ -933,11 +973,15 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	dev_pm_disable_wake_irq_check(dev, false);
 	retval = rpm_callback(callback, dev);
 	if (retval) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		__update_runtime_status(dev, RPM_SUSPENDED);
 		pm_runtime_cancel_pending(dev);
 		dev_pm_enable_wake_irq_check(dev, false);
 	} else {
  no_callback:
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		__update_runtime_status(dev, RPM_ACTIVE);
 		pm_runtime_mark_last_busy(dev);
 		if (parent)
@@ -949,7 +993,11 @@ static int rpm_resume(struct device *dev, int rpmflags)
 		rpm_idle(dev, RPM_ASYNC);
 
  out:
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	if (parent && !dev->power.irq_safe) {
+		if (!strcmp(dev_name(dev), "0000:00:01.1"))
+			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		spin_unlock_irq(&dev->power.lock);
 
 		pm_runtime_put(parent);
@@ -959,6 +1007,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 
 	trace_rpm_return_int(dev, _THIS_IP_, retval);
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	return retval;
 }
 
@@ -1181,17 +1231,27 @@ int __pm_runtime_resume(struct device *dev, int rpmflags)
 {
 	unsigned long flags;
 	int retval;
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 
 	might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe &&
 			dev->power.runtime_status != RPM_ACTIVE);
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	if (rpmflags & RPM_GET_PUT)
 		atomic_inc(&dev->power.usage_count);
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	spin_lock_irqsave(&dev->power.lock, flags);
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	retval = rpm_resume(dev, rpmflags);
 	spin_unlock_irqrestore(&dev->power.lock, flags);
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	return retval;
 }
 EXPORT_SYMBOL_GPL(__pm_runtime_resume);
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index d88d6b6ccf5b..0888b0d5ec73 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -508,6 +508,8 @@ static inline int pm_runtime_get(struct device *dev)
  */
 static inline int pm_runtime_get_sync(struct device *dev)
 {
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s\n", __func__);
 	return __pm_runtime_resume(dev, RPM_GET_PUT);
 }
 
With this there is no crash after 60h uptime with ~3093 GPP0 notifies, probably the
printk()s are mitigating the crash in some way (i.e. there's a race and the printk()s
are slowing down only one side ...).

It would be nice if we could get a crash while all the printk()s are in place,
but I'm not sure if we can ...

Stopped 6.17.0-rc6-next-20250917-gpudebug-00024-g5c6b49b810db after 60h and 3093 GPP0 notifies without crash.

Bert Karwatzki



  parent reply	other threads:[~2025-10-06 12:09 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-06 12:09 [REGRESSION 00/04] Crash during resume of pcie bridge Bert Karwatzki
2025-10-06 12:09 ` [REGRESSION 01/04] " Bert Karwatzki
2025-10-06 12:09 ` Bert Karwatzki [this message]
2025-10-06 12:09 ` [REGRESSION 03/04] " Bert Karwatzki
2025-10-06 12:09 ` [REGRESSION 04/04] " Bert Karwatzki
2025-10-06 12:39 ` [REGRESSION 00/04] " Christian König
2025-10-06 16:22   ` Bert Karwatzki
2025-10-07  6:50     ` Bert Karwatzki
2025-10-07 21:33 ` Mario Limonciello
2025-10-13 16:29   ` Bert Karwatzki
2025-10-13 18:51     ` Mario Limonciello
2025-10-14 10:50       ` Christian König
2025-10-27  9:57         ` Bert Karwatzki
2025-10-31 13:38           ` Bert Karwatzki
2025-10-31 13:47             ` Bert Karwatzki
2025-10-31 18:35               ` Bert Karwatzki
2025-11-05 11:44                 ` Bert Karwatzki
2025-11-05 21:31                   ` Mario Limonciello (AMD) (kernel.org)
2025-11-07 13:09                     ` Bert Karwatzki
2025-11-07 17:09                       ` Bert Karwatzki
2025-11-10 13:33                         ` Christian König
2025-11-16 21:08                           ` Crash during resume of pcie bridge due to infinite loop in ACPICA Bert Karwatzki
2025-11-17 16:40                             ` Rafael J. Wysocki
2025-11-24 22:34                               ` Bert Karwatzki
2025-11-25 19:46                                 ` Rafael J. Wysocki
2025-11-27  0:08                                   ` Bert Karwatzki
2025-11-27 13:02                                     ` Rafael J. Wysocki
2025-11-28 20:47                                       ` Bert Karwatzki
2025-12-02 18:59                                         ` Rafael J. Wysocki
2025-12-02 19:53                                           ` Bert Karwatzki
2025-12-02 20:01                                             ` Rafael J. Wysocki
2025-12-05 10:05                                               ` Crash during resume of pcie bridge due to incorrect error handling Bert Karwatzki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251006120944.7880-3-spasswolf@web.de \
    --to=spasswolf@web.de \
    --cc=christian.koenig@amd.com \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-next@vger.kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=linux-stable@vger.kernel.org \
    --cc=rafael.j.wysocki@intel.com \
    --cc=regressions@lists.linux.dev \
    --cc=superm1@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.