[PATCH v4 0/2] Unwind failed suspend

AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v4 0/2] Unwind failed suspend
@ 2025-10-23 16:52 Mario Limonciello
  2025-10-23 16:52 ` [PATCH v4 1/2] drm/amd: Unwind for failed device suspend Mario Limonciello
  2025-10-23 16:52 ` [PATCH v4 2/2] drm/amd: Reset the GPU if pmops failed Mario Limonciello
  0 siblings, 2 replies; 7+ messages in thread
From: Mario Limonciello @ 2025-10-23 16:52 UTC (permalink / raw)
  To: mario.limonciello, amd-gfx

If a suspend fails the PM core doesn't clean it up, the device
is just left in a bad state.  If this happens during memory pressure
it could be a hung system from just trying to suspend.

For all phases of suspend that return an error code, add an unwind
flow that will resume the parts that have failed.

If this fails, then reset the GPU during complete() callback.

v4:
 * fix git am failure from fuzz

Mario Limonciello (2):
  drm/amd: Unwind for failed device suspend
  drm/amd: Reset the GPU if pmops failed

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 80 +++++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 11 +++
 2 files changed, 83 insertions(+), 8 deletions(-)

-- 
2.51.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v4 1/2] drm/amd: Unwind for failed device suspend
  2025-10-23 16:52 [PATCH v4 0/2] Unwind failed suspend Mario Limonciello
@ 2025-10-23 16:52 ` Mario Limonciello
  2025-10-29 21:19   ` Alex Deucher
  2025-10-23 16:52 ` [PATCH v4 2/2] drm/amd: Reset the GPU if pmops failed Mario Limonciello
  1 sibling, 1 reply; 7+ messages in thread
From: Mario Limonciello @ 2025-10-23 16:52 UTC (permalink / raw)
  To: mario.limonciello, amd-gfx

If device suspend has failed, add a recovery flow that will attempt
to unwind the suspend and get things back up and running.

Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4627
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 80 +++++++++++++++++++---
 1 file changed, 72 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3ffb9bb1ec0b..645b15aa34f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5231,7 +5231,7 @@ void amdgpu_device_complete(struct drm_device *dev)
 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 {
 	struct amdgpu_device *adev = drm_to_adev(dev);
-	int r = 0;
+	int r, rec;
 
 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
 		return 0;
@@ -5247,8 +5247,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 			return r;
 	}
 
-	if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
-		dev_warn(adev->dev, "smart shift update failed\n");
+	r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
+	if (r)
+		goto unwind_sriov;
 
 	if (notify_clients)
 		drm_client_dev_suspend(adev_to_drm(adev), false);
@@ -5259,16 +5260,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 
 	r = amdgpu_device_ip_suspend_phase1(adev);
 	if (r)
-		return r;
+		goto unwind_smartshift;
 
 	amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
 	r = amdgpu_userq_suspend(adev);
 	if (r)
-		return r;
+		goto unwind_ip_phase1;
 
 	r = amdgpu_device_evict_resources(adev);
 	if (r)
-		return r;
+		goto unwind_userq;
 
 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
 
@@ -5276,16 +5277,79 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 
 	r = amdgpu_device_ip_suspend_phase2(adev);
 	if (r)
-		return r;
+		goto unwind_evict;
 
 	if (amdgpu_sriov_vf(adev))
 		amdgpu_virt_release_full_gpu(adev, false);
 
 	r = amdgpu_dpm_notify_rlc_state(adev, false);
 	if (r)
-		return r;
+		goto unwind_ip_phase2;
 
 	return 0;
+
+unwind_ip_phase2:
+	/* suspend phase 2 = resume phase 2 + resume phase 1 */
+	rec = amdgpu_device_ip_resume_phase2(adev);
+	if (rec) {
+		dev_warn(adev->dev, "failed to re-initialize IPs phase2: %d\n", rec);
+		return r;
+	}
+	rec = amdgpu_device_fw_loading(adev);
+	if (rec) {
+		dev_warn(adev->dev, "failed to reload firmwares: %d\n", rec);
+		return r;
+	}
+	rec = amdgpu_device_ip_resume_phase1(adev);
+	if (rec) {
+		dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
+		return r;
+	}
+
+unwind_evict:
+	if (adev->mman.buffer_funcs_ring->sched.ready)
+		amdgpu_ttm_set_buffer_funcs_status(adev, true);
+	amdgpu_fence_driver_hw_init(adev);
+
+unwind_userq:
+	rec = amdgpu_userq_resume(adev);
+	if (rec) {
+		dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
+		return r;
+	}
+	rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
+	if (rec) {
+		dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
+		return r;
+	}
+
+unwind_ip_phase1:
+	/* suspend phase 1 = resume phase 3 */
+	rec = amdgpu_device_ip_resume_phase3(adev);
+	if (rec) {
+		dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
+		return r;
+	}
+
+unwind_smartshift:
+	rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
+	if (rec) {
+		dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
+		return r;
+	}
+
+unwind_sriov:
+	if (amdgpu_sriov_vf(adev)) {
+		rec = amdgpu_virt_request_full_gpu(adev, true);
+		if (rec) {
+			dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
+			return r;
+		}
+	}
+
+	adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
+
+	return r;
 }
 
 static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v4 2/2] drm/amd: Reset the GPU if pmops failed
  2025-10-23 16:52 [PATCH v4 0/2] Unwind failed suspend Mario Limonciello
  2025-10-23 16:52 ` [PATCH v4 1/2] drm/amd: Unwind for failed device suspend Mario Limonciello
@ 2025-10-23 16:52 ` Mario Limonciello
  2025-10-29 21:28   ` Alex Deucher
  1 sibling, 1 reply; 7+ messages in thread
From: Mario Limonciello @ 2025-10-23 16:52 UTC (permalink / raw)
  To: mario.limonciello, amd-gfx

If the GPU fails to suspend the return code is passed up to the caller
but it's left in an inconsistent state.  This could lead to hangs
if userspace tries to access it.

The last stage of all pmpops calls (success or fail) is the complete()
callback.  If by the time the PM core reaches this state the GPU is still
in suspend something went really wrong, so reset it.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index a36e15beafeb..e2d598dd462d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2590,6 +2590,17 @@ static int amdgpu_pmops_prepare(struct device *dev)
 
 static void amdgpu_pmops_complete(struct device *dev)
 {
+	struct drm_device *drm_dev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(drm_dev);
+
+	/* sequence failed, use a big 🔨 try to cleanup */
+	if (adev->in_suspend) {
+		adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
+		dev_crit(adev->dev, "pmpops sequence failed, resetting\n");
+		amdgpu_asic_reset(adev);
+		return;
+	}
+
 	amdgpu_device_complete(dev_get_drvdata(dev));
 }
 
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 1/2] drm/amd: Unwind for failed device suspend
  2025-10-23 16:52 ` [PATCH v4 1/2] drm/amd: Unwind for failed device suspend Mario Limonciello
@ 2025-10-29 21:19   ` Alex Deucher
  2025-10-30 14:33     ` Mario Limonciello
  0 siblings, 1 reply; 7+ messages in thread
From: Alex Deucher @ 2025-10-29 21:19 UTC (permalink / raw)
  To: Mario Limonciello; +Cc: amd-gfx

On Thu, Oct 23, 2025 at 12:53 PM Mario Limonciello
<mario.limonciello@amd.com> wrote:
>
> If device suspend has failed, add a recovery flow that will attempt
> to unwind the suspend and get things back up and running.
>
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4627
> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>

Patch is:
Acked-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 80 +++++++++++++++++++---
>  1 file changed, 72 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3ffb9bb1ec0b..645b15aa34f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5231,7 +5231,7 @@ void amdgpu_device_complete(struct drm_device *dev)
>  int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>  {
>         struct amdgpu_device *adev = drm_to_adev(dev);
> -       int r = 0;
> +       int r, rec;
>
>         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
>                 return 0;
> @@ -5247,8 +5247,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>                         return r;
>         }
>
> -       if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
> -               dev_warn(adev->dev, "smart shift update failed\n");
> +       r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
> +       if (r)
> +               goto unwind_sriov;
>
>         if (notify_clients)
>                 drm_client_dev_suspend(adev_to_drm(adev), false);
> @@ -5259,16 +5260,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>
>         r = amdgpu_device_ip_suspend_phase1(adev);
>         if (r)
> -               return r;
> +               goto unwind_smartshift;
>
>         amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
>         r = amdgpu_userq_suspend(adev);
>         if (r)
> -               return r;
> +               goto unwind_ip_phase1;
>
>         r = amdgpu_device_evict_resources(adev);
>         if (r)
> -               return r;
> +               goto unwind_userq;
>
>         amdgpu_ttm_set_buffer_funcs_status(adev, false);
>
> @@ -5276,16 +5277,79 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>
>         r = amdgpu_device_ip_suspend_phase2(adev);
>         if (r)
> -               return r;
> +               goto unwind_evict;
>
>         if (amdgpu_sriov_vf(adev))
>                 amdgpu_virt_release_full_gpu(adev, false);
>
>         r = amdgpu_dpm_notify_rlc_state(adev, false);
>         if (r)
> -               return r;
> +               goto unwind_ip_phase2;
>
>         return 0;
> +
> +unwind_ip_phase2:
> +       /* suspend phase 2 = resume phase 2 + resume phase 1 */
> +       rec = amdgpu_device_ip_resume_phase2(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize IPs phase2: %d\n", rec);
> +               return r;
> +       }
> +       rec = amdgpu_device_fw_loading(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to reload firmwares: %d\n", rec);
> +               return r;
> +       }
> +       rec = amdgpu_device_ip_resume_phase1(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
> +               return r;
> +       }
> +
> +unwind_evict:
> +       if (adev->mman.buffer_funcs_ring->sched.ready)
> +               amdgpu_ttm_set_buffer_funcs_status(adev, true);
> +       amdgpu_fence_driver_hw_init(adev);
> +
> +unwind_userq:
> +       rec = amdgpu_userq_resume(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
> +               return r;
> +       }
> +       rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
> +               return r;
> +       }
> +
> +unwind_ip_phase1:
> +       /* suspend phase 1 = resume phase 3 */
> +       rec = amdgpu_device_ip_resume_phase3(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
> +               return r;
> +       }
> +
> +unwind_smartshift:
> +       rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
> +               return r;
> +       }
> +
> +unwind_sriov:
> +       if (amdgpu_sriov_vf(adev)) {
> +               rec = amdgpu_virt_request_full_gpu(adev, true);
> +               if (rec) {
> +                       dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
> +                       return r;
> +               }
> +       }
> +
> +       adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
> +
> +       return r;
>  }
>
>  static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
> --
> 2.51.1
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 2/2] drm/amd: Reset the GPU if pmops failed
  2025-10-23 16:52 ` [PATCH v4 2/2] drm/amd: Reset the GPU if pmops failed Mario Limonciello
@ 2025-10-29 21:28   ` Alex Deucher
  2025-10-30 14:35     ` Mario Limonciello
  0 siblings, 1 reply; 7+ messages in thread
From: Alex Deucher @ 2025-10-29 21:28 UTC (permalink / raw)
  To: Mario Limonciello; +Cc: amd-gfx

On Thu, Oct 23, 2025 at 1:01 PM Mario Limonciello
<mario.limonciello@amd.com> wrote:
>
> If the GPU fails to suspend the return code is passed up to the caller
> but it's left in an inconsistent state.  This could lead to hangs
> if userspace tries to access it.
>
> The last stage of all pmpops calls (success or fail) is the complete()
> callback.  If by the time the PM core reaches this state the GPU is still
> in suspend something went really wrong, so reset it.

What happens at that stage?  Resetting would put the GPU back into a
known state from a hardware perspective (amdgpu_asic_reset() just does
the hardware reset), but software would still think things are
suspended so the sw side would still be broken.  If you want to try
and reset both the hw and sw state, you'd need to do the whole gpu
recovery sequence via a worker thread.  E.g., see
amdgpu_debugfs_reset_work() for reference.

Alex

>
> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +++++++++++
>  1 file changed, 11 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index a36e15beafeb..e2d598dd462d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2590,6 +2590,17 @@ static int amdgpu_pmops_prepare(struct device *dev)
>
>  static void amdgpu_pmops_complete(struct device *dev)
>  {
> +       struct drm_device *drm_dev = dev_get_drvdata(dev);
> +       struct amdgpu_device *adev = drm_to_adev(drm_dev);
> +
> +       /* sequence failed, use a big 🔨 try to cleanup */
> +       if (adev->in_suspend) {
> +               adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
> +               dev_crit(adev->dev, "pmpops sequence failed, resetting\n");
> +               amdgpu_asic_reset(adev);
> +               return;
> +       }
> +
>         amdgpu_device_complete(dev_get_drvdata(dev));
>  }
>
> --
> 2.51.1
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 1/2] drm/amd: Unwind for failed device suspend
  2025-10-29 21:19   ` Alex Deucher
@ 2025-10-30 14:33     ` Mario Limonciello
  0 siblings, 0 replies; 7+ messages in thread
From: Mario Limonciello @ 2025-10-30 14:33 UTC (permalink / raw)
  To: Alex Deucher; +Cc: amd-gfx



On 10/29/2025 4:19 PM, Alex Deucher wrote:
> On Thu, Oct 23, 2025 at 12:53 PM Mario Limonciello
> <mario.limonciello@amd.com> wrote:
>>
>> If device suspend has failed, add a recovery flow that will attempt
>> to unwind the suspend and get things back up and running.
>>
>> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4627
>> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
> 
> Patch is:
> Acked-by: Alex Deucher <alexander.deucher@amd.com>

I made some minor changes to this patch from testing feedback in a v5. 
It's now patch 4/5 with a few other things I found in 2-3 and part of 
another of your patch as 1.

Could you check that one?

https://lore.kernel.org/amd-gfx/20251026042942.549389-1-superm1@kernel.org/
> 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 80 +++++++++++++++++++---
>>   1 file changed, 72 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 3ffb9bb1ec0b..645b15aa34f1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5231,7 +5231,7 @@ void amdgpu_device_complete(struct drm_device *dev)
>>   int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>>   {
>>          struct amdgpu_device *adev = drm_to_adev(dev);
>> -       int r = 0;
>> +       int r, rec;
>>
>>          if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
>>                  return 0;
>> @@ -5247,8 +5247,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>>                          return r;
>>          }
>>
>> -       if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
>> -               dev_warn(adev->dev, "smart shift update failed\n");
>> +       r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
>> +       if (r)
>> +               goto unwind_sriov;
>>
>>          if (notify_clients)
>>                  drm_client_dev_suspend(adev_to_drm(adev), false);
>> @@ -5259,16 +5260,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>>
>>          r = amdgpu_device_ip_suspend_phase1(adev);
>>          if (r)
>> -               return r;
>> +               goto unwind_smartshift;
>>
>>          amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
>>          r = amdgpu_userq_suspend(adev);
>>          if (r)
>> -               return r;
>> +               goto unwind_ip_phase1;
>>
>>          r = amdgpu_device_evict_resources(adev);
>>          if (r)
>> -               return r;
>> +               goto unwind_userq;
>>
>>          amdgpu_ttm_set_buffer_funcs_status(adev, false);
>>
>> @@ -5276,16 +5277,79 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>>
>>          r = amdgpu_device_ip_suspend_phase2(adev);
>>          if (r)
>> -               return r;
>> +               goto unwind_evict;
>>
>>          if (amdgpu_sriov_vf(adev))
>>                  amdgpu_virt_release_full_gpu(adev, false);
>>
>>          r = amdgpu_dpm_notify_rlc_state(adev, false);
>>          if (r)
>> -               return r;
>> +               goto unwind_ip_phase2;
>>
>>          return 0;
>> +
>> +unwind_ip_phase2:
>> +       /* suspend phase 2 = resume phase 2 + resume phase 1 */
>> +       rec = amdgpu_device_ip_resume_phase2(adev);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-initialize IPs phase2: %d\n", rec);
>> +               return r;
>> +       }
>> +       rec = amdgpu_device_fw_loading(adev);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to reload firmwares: %d\n", rec);
>> +               return r;
>> +       }
>> +       rec = amdgpu_device_ip_resume_phase1(adev);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
>> +               return r;
>> +       }
>> +
>> +unwind_evict:
>> +       if (adev->mman.buffer_funcs_ring->sched.ready)
>> +               amdgpu_ttm_set_buffer_funcs_status(adev, true);
>> +       amdgpu_fence_driver_hw_init(adev);
>> +
>> +unwind_userq:
>> +       rec = amdgpu_userq_resume(adev);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
>> +               return r;
>> +       }
>> +       rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
>> +               return r;
>> +       }
>> +
>> +unwind_ip_phase1:
>> +       /* suspend phase 1 = resume phase 3 */
>> +       rec = amdgpu_device_ip_resume_phase3(adev);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
>> +               return r;
>> +       }
>> +
>> +unwind_smartshift:
>> +       rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
>> +               return r;
>> +       }
>> +
>> +unwind_sriov:
>> +       if (amdgpu_sriov_vf(adev)) {
>> +               rec = amdgpu_virt_request_full_gpu(adev, true);
>> +               if (rec) {
>> +                       dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
>> +                       return r;
>> +               }
>> +       }
>> +
>> +       adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
>> +
>> +       return r;
>>   }
>>
>>   static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
>> --
>> 2.51.1
>>


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 2/2] drm/amd: Reset the GPU if pmops failed
  2025-10-29 21:28   ` Alex Deucher
@ 2025-10-30 14:35     ` Mario Limonciello
  0 siblings, 0 replies; 7+ messages in thread
From: Mario Limonciello @ 2025-10-30 14:35 UTC (permalink / raw)
  To: Alex Deucher; +Cc: amd-gfx



On 10/29/2025 4:28 PM, Alex Deucher wrote:
> On Thu, Oct 23, 2025 at 1:01 PM Mario Limonciello
> <mario.limonciello@amd.com> wrote:
>>
>> If the GPU fails to suspend the return code is passed up to the caller
>> but it's left in an inconsistent state.  This could lead to hangs
>> if userspace tries to access it.
>>
>> The last stage of all pmpops calls (success or fail) is the complete()
>> callback.  If by the time the PM core reaches this state the GPU is still
>> in suspend something went really wrong, so reset it.
> 
> What happens at that stage?  Resetting would put the GPU back into a
> known state from a hardware perspective (amdgpu_asic_reset() just does
> the hardware reset), but software would still think things are
> suspended so the sw side would still be broken.  If you want to try
> and reset both the hw and sw state, you'd need to do the whole gpu
> recovery sequence via a worker thread.  E.g., see
> amdgpu_debugfs_reset_work() for reference.

I guess I go back and forth on this 5th patch.

It's hypothetical and mostly in case I missed something in the unwind; 
and I I'm leaning to drop instead of add the complexity.

> 
> Alex
> 
>>
>> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +++++++++++
>>   1 file changed, 11 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index a36e15beafeb..e2d598dd462d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -2590,6 +2590,17 @@ static int amdgpu_pmops_prepare(struct device *dev)
>>
>>   static void amdgpu_pmops_complete(struct device *dev)
>>   {
>> +       struct drm_device *drm_dev = dev_get_drvdata(dev);
>> +       struct amdgpu_device *adev = drm_to_adev(drm_dev);
>> +
>> +       /* sequence failed, use a big 🔨 try to cleanup */
>> +       if (adev->in_suspend) {
>> +               adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
>> +               dev_crit(adev->dev, "pmpops sequence failed, resetting\n");
>> +               amdgpu_asic_reset(adev);
>> +               return;
>> +       }
>> +
>>          amdgpu_device_complete(dev_get_drvdata(dev));
>>   }
>>
>> --
>> 2.51.1
>>


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-10-30 14:35 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-23 16:52 [PATCH v4 0/2] Unwind failed suspend Mario Limonciello
2025-10-23 16:52 ` [PATCH v4 1/2] drm/amd: Unwind for failed device suspend Mario Limonciello
2025-10-29 21:19   ` Alex Deucher
2025-10-30 14:33     ` Mario Limonciello
2025-10-23 16:52 ` [PATCH v4 2/2] drm/amd: Reset the GPU if pmops failed Mario Limonciello
2025-10-29 21:28   ` Alex Deucher
2025-10-30 14:35     ` Mario Limonciello

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox