AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Mario Limonciello <mario.limonciello@amd.com>
To: Alex Deucher <alexdeucher@gmail.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH v4 1/2] drm/amd: Unwind for failed device suspend
Date: Thu, 30 Oct 2025 09:33:54 -0500	[thread overview]
Message-ID: <4fd325d1-86a4-42dc-bed0-d13d76e05226@amd.com> (raw)
In-Reply-To: <CADnq5_N784+4XCPF3VCvpWZ86wKmfcbYrkvaEsU6jgNufrcOaQ@mail.gmail.com>



On 10/29/2025 4:19 PM, Alex Deucher wrote:
> On Thu, Oct 23, 2025 at 12:53 PM Mario Limonciello
> <mario.limonciello@amd.com> wrote:
>>
>> If device suspend has failed, add a recovery flow that will attempt
>> to unwind the suspend and get things back up and running.
>>
>> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4627
>> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
> 
> Patch is:
> Acked-by: Alex Deucher <alexander.deucher@amd.com>

I made some minor changes to this patch, based on testing feedback, in a v5.
It's now patch 4/5, with a few other things I found as patches 2-3 and part
of another of your patches as patch 1.

Could you check that one?

https://lore.kernel.org/amd-gfx/20251026042942.549389-1-superm1@kernel.org/
> 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 80 +++++++++++++++++++---
>>   1 file changed, 72 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 3ffb9bb1ec0b..645b15aa34f1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5231,7 +5231,7 @@ void amdgpu_device_complete(struct drm_device *dev)
>>   int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>>   {
>>          struct amdgpu_device *adev = drm_to_adev(dev);
>> -       int r = 0;
>> +       int r, rec;
>>
>>          if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
>>                  return 0;
>> @@ -5247,8 +5247,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>>                          return r;
>>          }
>>
>> -       if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
>> -               dev_warn(adev->dev, "smart shift update failed\n");
>> +       r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
>> +       if (r)
>> +               goto unwind_sriov;
>>
>>          if (notify_clients)
>>                  drm_client_dev_suspend(adev_to_drm(adev), false);
>> @@ -5259,16 +5260,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>>
>>          r = amdgpu_device_ip_suspend_phase1(adev);
>>          if (r)
>> -               return r;
>> +               goto unwind_smartshift;
>>
>>          amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
>>          r = amdgpu_userq_suspend(adev);
>>          if (r)
>> -               return r;
>> +               goto unwind_ip_phase1;
>>
>>          r = amdgpu_device_evict_resources(adev);
>>          if (r)
>> -               return r;
>> +               goto unwind_userq;
>>
>>          amdgpu_ttm_set_buffer_funcs_status(adev, false);
>>
>> @@ -5276,16 +5277,79 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>>
>>          r = amdgpu_device_ip_suspend_phase2(adev);
>>          if (r)
>> -               return r;
>> +               goto unwind_evict;
>>
>>          if (amdgpu_sriov_vf(adev))
>>                  amdgpu_virt_release_full_gpu(adev, false);
>>
>>          r = amdgpu_dpm_notify_rlc_state(adev, false);
>>          if (r)
>> -               return r;
>> +               goto unwind_ip_phase2;
>>
>>          return 0;
>> +
>> +unwind_ip_phase2:
>> +       /* suspend phase 2 = resume phase 2 + resume phase 1 */
>> +       rec = amdgpu_device_ip_resume_phase2(adev);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-initialize IPs phase2: %d\n", rec);
>> +               return r;
>> +       }
>> +       rec = amdgpu_device_fw_loading(adev);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to reload firmwares: %d\n", rec);
>> +               return r;
>> +       }
>> +       rec = amdgpu_device_ip_resume_phase1(adev);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
>> +               return r;
>> +       }
>> +
>> +unwind_evict:
>> +       if (adev->mman.buffer_funcs_ring->sched.ready)
>> +               amdgpu_ttm_set_buffer_funcs_status(adev, true);
>> +       amdgpu_fence_driver_hw_init(adev);
>> +
>> +unwind_userq:
>> +       rec = amdgpu_userq_resume(adev);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
>> +               return r;
>> +       }
>> +       rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
>> +               return r;
>> +       }
>> +
>> +unwind_ip_phase1:
>> +       /* suspend phase 1 = resume phase 3 */
>> +       rec = amdgpu_device_ip_resume_phase3(adev);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
>> +               return r;
>> +       }
>> +
>> +unwind_smartshift:
>> +       rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
>> +       if (rec) {
>> +               dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
>> +               return r;
>> +       }
>> +
>> +unwind_sriov:
>> +       if (amdgpu_sriov_vf(adev)) {
>> +               rec = amdgpu_virt_request_full_gpu(adev, true);
>> +               if (rec) {
>> +                       dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
>> +                       return r;
>> +               }
>> +       }
>> +
>> +       adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
>> +
>> +       return r;
>>   }
>>
>>   static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
>> --
>> 2.51.1
>>


  reply	other threads:[~2025-10-30 14:34 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-23 16:52 [PATCH v4 0/2] Unwind failed suspend Mario Limonciello
2025-10-23 16:52 ` [PATCH v4 1/2] drm/amd: Unwind for failed device suspend Mario Limonciello
2025-10-29 21:19   ` Alex Deucher
2025-10-30 14:33     ` Mario Limonciello [this message]
2025-10-23 16:52 ` [PATCH v4 2/2] drm/amd: Reset the GPU if pmops failed Mario Limonciello
2025-10-29 21:28   ` Alex Deucher
2025-10-30 14:35     ` Mario Limonciello

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4fd325d1-86a4-42dc-bed0-d13d76e05226@amd.com \
    --to=mario.limonciello@amd.com \
    --cc=alexdeucher@gmail.com \
    --cc=amd-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox