From: Bert Karwatzki <spasswolf@web.de>
To: linux-kernel@vger.kernel.org
Cc: "Bert Karwatzki" <spasswolf@web.de>,
linux-next@vger.kernel.org, linux-stable@vger.kernel.org,
regressions@lists.linux.dev, linux-pci@vger.kernel.org,
linux-acpi@vger.kernel.org,
"Mario Limonciello" <superm1@kernel.org>,
"Christian König" <christian.koenig@amd.com>,
"Rafael J . Wysocki" <rafael.j.wysocki@intel.com>
Subject: [REGRESSION 02/04] Crash during resume of pcie bridge
Date: Mon, 6 Oct 2025 14:09:41 +0200 [thread overview]
Message-ID: <20251006120944.7880-3-spasswolf@web.de> (raw)
In-Reply-To: <20251006120944.7880-1-spasswolf@web.de>
The next step is to monitor pm_runtime_get_sync(), rpm_resume() and __pm_runtime_resume().
Here we need to use conditional debugging output or else we get messages at a
rate of about a million lines per minute.
These is the additional debugging used in
6.17.0-rc6-next-20250917-gpudebug-00024-g5c6b49b810db
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 7420b9851fe0..895898c3cd56 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -787,12 +787,18 @@ static int rpm_resume(struct device *dev, int rpmflags)
struct device *parent = NULL;
int retval = 0;
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
trace_rpm_resume(dev, rpmflags);
repeat:
if (dev->power.runtime_error) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
retval = -EINVAL;
} else if (dev->power.disable_depth > 0) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (dev->power.runtime_status == RPM_ACTIVE &&
dev->power.last_status == RPM_ACTIVE)
retval = 1;
@@ -808,31 +814,45 @@ static int rpm_resume(struct device *dev, int rpmflags)
* rather than cancelling it now only to restart it again in the near
* future.
*/
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
dev->power.request = RPM_REQ_NONE;
if (!dev->power.timer_autosuspends)
pm_runtime_deactivate_timer(dev);
if (dev->power.runtime_status == RPM_ACTIVE) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
retval = 1;
goto out;
}
if (dev->power.runtime_status == RPM_RESUMING ||
dev->power.runtime_status == RPM_SUSPENDING) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
DEFINE_WAIT(wait);
if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (dev->power.runtime_status == RPM_SUSPENDING) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
dev->power.deferred_resume = true;
if (rpmflags & RPM_NOWAIT)
retval = -EINPROGRESS;
} else {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
retval = -EINPROGRESS;
}
goto out;
}
if (dev->power.irq_safe) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
spin_unlock(&dev->power.lock);
cpu_relax();
@@ -856,6 +876,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
spin_lock_irq(&dev->power.lock);
}
finish_wait(&dev->power.wait_queue, &wait);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
goto repeat;
}
@@ -865,22 +887,32 @@ static int rpm_resume(struct device *dev, int rpmflags)
* the resume will actually succeed.
*/
if (dev->power.no_callbacks && !parent && dev->parent) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING);
if (dev->parent->power.disable_depth > 0 ||
dev->parent->power.ignore_children ||
dev->parent->power.runtime_status == RPM_ACTIVE) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
atomic_inc(&dev->parent->power.child_count);
spin_unlock(&dev->parent->power.lock);
retval = 1;
goto no_callback; /* Assume success. */
}
spin_unlock(&dev->parent->power.lock);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
}
/* Carry out an asynchronous or a synchronous resume. */
if (rpmflags & RPM_ASYNC) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
dev->power.request = RPM_REQ_RESUME;
if (!dev->power.request_pending) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
dev->power.request_pending = true;
queue_work(pm_wq, &dev->power.work);
}
@@ -894,6 +926,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
* necessary. Not needed if dev is irq-safe; then the
* parent is permanently resumed.
*/
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
parent = dev->parent;
if (dev->power.irq_safe)
goto skip_parent;
@@ -909,6 +943,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
*/
if (!parent->power.disable_depth &&
!parent->power.ignore_children) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
rpm_resume(parent, 0);
if (parent->power.runtime_status != RPM_ACTIVE)
retval = -EBUSY;
@@ -919,10 +955,14 @@ static int rpm_resume(struct device *dev, int rpmflags)
if (retval)
goto out;
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
goto repeat;
}
skip_parent:
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (dev->power.no_callbacks)
goto no_callback; /* Assume success. */
@@ -933,11 +973,15 @@ static int rpm_resume(struct device *dev, int rpmflags)
dev_pm_disable_wake_irq_check(dev, false);
retval = rpm_callback(callback, dev);
if (retval) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
__update_runtime_status(dev, RPM_SUSPENDED);
pm_runtime_cancel_pending(dev);
dev_pm_enable_wake_irq_check(dev, false);
} else {
no_callback:
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
__update_runtime_status(dev, RPM_ACTIVE);
pm_runtime_mark_last_busy(dev);
if (parent)
@@ -949,7 +993,11 @@ static int rpm_resume(struct device *dev, int rpmflags)
rpm_idle(dev, RPM_ASYNC);
out:
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (parent && !dev->power.irq_safe) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
spin_unlock_irq(&dev->power.lock);
pm_runtime_put(parent);
@@ -959,6 +1007,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
trace_rpm_return_int(dev, _THIS_IP_, retval);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
return retval;
}
@@ -1181,17 +1231,27 @@ int __pm_runtime_resume(struct device *dev, int rpmflags)
{
unsigned long flags;
int retval;
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe &&
dev->power.runtime_status != RPM_ACTIVE);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (rpmflags & RPM_GET_PUT)
atomic_inc(&dev->power.usage_count);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
spin_lock_irqsave(&dev->power.lock, flags);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
retval = rpm_resume(dev, rpmflags);
spin_unlock_irqrestore(&dev->power.lock, flags);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
return retval;
}
EXPORT_SYMBOL_GPL(__pm_runtime_resume);
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index d88d6b6ccf5b..0888b0d5ec73 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -508,6 +508,8 @@ static inline int pm_runtime_get(struct device *dev)
*/
static inline int pm_runtime_get_sync(struct device *dev)
{
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s\n", __func__);
return __pm_runtime_resume(dev, RPM_GET_PUT);
}
With this there is no crash after 60h uptime with ~3093 GPP0 notifies, probably the
printk()s are mitigating the crash in some way (i.e. there's a race and the printk()s
are slowing down only one side ...).
It would be nice if we could get a crash while all the printk()s are in place,
but I'm not sure if we can ...
Stopped 6.17.0-rc6-next-20250917-gpudebug-00024-g5c6b49b810db after 60h and 3093 GPP0 notifies without crash.
Bert Karwatzki
next prev parent reply other threads:[~2025-10-06 12:09 UTC|newest]
Thread overview: 31+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-06 12:09 [REGRESSION 00/04] Crash during resume of pcie bridge Bert Karwatzki
2025-10-06 12:09 ` [REGRESSION 01/04] " Bert Karwatzki
2025-10-06 12:09 ` Bert Karwatzki [this message]
2025-10-06 12:09 ` [REGRESSION 03/04] " Bert Karwatzki
2025-10-06 12:09 ` [REGRESSION 04/04] " Bert Karwatzki
2025-10-06 12:39 ` [REGRESSION 00/04] " Christian König
2025-10-06 16:22 ` Bert Karwatzki
2025-10-07 6:50 ` Bert Karwatzki
2025-10-07 21:33 ` Mario Limonciello
2025-10-13 16:29 ` Bert Karwatzki
2025-10-13 18:51 ` Mario Limonciello
2025-10-14 10:50 ` Christian König
[not found] ` <1853e2af7f70cf726df278137b6d2d89d9d9dc82.camel@web.de>
2025-10-31 13:38 ` Bert Karwatzki
2025-10-31 13:47 ` Bert Karwatzki
2025-10-31 18:35 ` Bert Karwatzki
2025-11-05 11:44 ` Bert Karwatzki
2025-11-05 21:31 ` Mario Limonciello (AMD) (kernel.org)
2025-11-07 13:09 ` Bert Karwatzki
2025-11-07 17:09 ` Bert Karwatzki
2025-11-10 13:33 ` Christian König
2025-11-16 21:08 ` Crash during resume of pcie bridge due to infinite loop in ACPICA Bert Karwatzki
2025-11-17 16:40 ` Rafael J. Wysocki
2025-11-24 22:34 ` Bert Karwatzki
2025-11-25 19:46 ` Rafael J. Wysocki
2025-11-27 0:08 ` Bert Karwatzki
2025-11-27 13:02 ` Rafael J. Wysocki
2025-11-28 20:47 ` Bert Karwatzki
2025-12-02 18:59 ` Rafael J. Wysocki
2025-12-02 19:53 ` Bert Karwatzki
2025-12-02 20:01 ` Rafael J. Wysocki
2025-12-05 10:05 ` Crash during resume of pcie bridge due to incorrect error handling Bert Karwatzki
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251006120944.7880-3-spasswolf@web.de \
--to=spasswolf@web.de \
--cc=christian.koenig@amd.com \
--cc=linux-acpi@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-next@vger.kernel.org \
--cc=linux-pci@vger.kernel.org \
--cc=linux-stable@vger.kernel.org \
--cc=rafael.j.wysocki@intel.com \
--cc=regressions@lists.linux.dev \
--cc=superm1@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).