* [PATCH 1/2] drm/amdgpu: Fix mes remove_hw_queue lock
@ 2026-06-18 1:23 Amber Lin
2026-06-18 1:23 ` [PATCH 2/2] drm/amdkfd: Remove extra function Amber Lin
0 siblings, 1 reply; 3+ messages in thread
From: Amber Lin @ 2026-06-18 1:23 UTC (permalink / raw)
To: amd-gfx, Jesse.Zhang; +Cc: Amber Lin
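The down_read_trylock()/up_read() on the adev->reset_domain semaphore
should be placed around the remove_hw_queue call in
amdgpu_gfx_reset_mes_compute(), not around reset_queues_mes() in
recover_bad_queue_mes().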
Fixes: f5587b4740c0 ("drm/amdgpu: Remove faulty queue before resume")
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 5 +++++
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ----
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 0506b90f318e..982b41606d48 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -2358,9 +2358,14 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
* preempted successfuly. Remove it before resume all so it
* doesn't get mapped back
*/
+ if (!down_read_trylock(&adev->reset_domain->sem)) {
+ r = -EIO;
+ goto out;
+ }
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->remove_hw_queue(&adev->mes, queue_input);
amdgpu_mes_unlock(&adev->mes);
+ up_read(&adev->reset_domain->sem);
}
out:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index fc1d179148c0..4e60d9364e37 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -493,12 +493,8 @@ static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct queue
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
int r = 0;
- if (!down_read_trylock(&adev->reset_domain->sem))
- return -EIO;
-
r = reset_queues_mes(dqm, q);
- up_read(&adev->reset_domain->sem);
return r;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH 2/2] drm/amdkfd: Remove extra function
2026-06-18 1:23 [PATCH 1/2] drm/amdgpu: Fix mes remove_hw_queue lock Amber Lin
@ 2026-06-18 1:23 ` Amber Lin
2026-06-18 1:32 ` Zhang, Jesse(Jie)
0 siblings, 1 reply; 3+ messages in thread
From: Amber Lin @ 2026-06-18 1:23 UTC (permalink / raw)
To: amd-gfx, Jesse.Zhang; +Cc: Amber Lin
recover_bad_queue_mes is not needed. Call reset_queues_mes directly.
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 19 +++++--------------
1 file changed, 5 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 4e60d9364e37..83be54372b18 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -72,11 +72,11 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q, const uint32_t *restore_sdma_id);
static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
-static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct queue *q);
static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
u32 doorbell_offset);
static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd);
+static int reset_queues_mes(struct device_queue_manager *dqm, struct queue *q);
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
@@ -333,11 +333,12 @@ static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, st
amdgpu_mes_unlock(&adev->mes);
up_read(&adev->reset_domain->sem);
+ /* If is_for_reset set, it is a mes internal cleanup */
if (!r || is_for_reset)
return r;
- /* remove_hw_queue failed. try to recover */
- r = recover_bad_queue_mes(dqm, q);
+ /* remove_hw_queue failure indicates a queue hang. reset the queue */
+ r = reset_queues_mes(dqm, q);
if (r && amdgpu_gpu_recovery) {
dev_err(adev->dev, "failed to remove queue from MES, doorbell=0x%x\n",
q->properties.doorbell_off);
@@ -488,16 +489,6 @@ static int reset_queues_mes(struct device_queue_manager *dqm, struct queue *q)
return r;
}
-static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct queue *q)
-{
- struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
- int r = 0;
-
- r = reset_queues_mes(dqm, q);
-
- return r;
-}
-
static void increment_queue_count(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
@@ -3269,7 +3260,7 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbel
list_for_each_entry(q, &qpd->queues_list, list) {
if (q->doorbell_id == doorbell_id && q->properties.is_active) {
- recover_bad_queue_mes(dqm, q);
+ reset_queues_mes(dqm, q);
q->properties.is_evicted = true;
q->properties.is_active = false;
decrement_queue_count(dqm, qpd, q);
--
2.43.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* RE: [PATCH 2/2] drm/amdkfd: Remove extra function
2026-06-18 1:23 ` [PATCH 2/2] drm/amdkfd: Remove extra function Amber Lin
@ 2026-06-18 1:32 ` Zhang, Jesse(Jie)
0 siblings, 0 replies; 3+ messages in thread
From: Zhang, Jesse(Jie) @ 2026-06-18 1:32 UTC (permalink / raw)
To: Lin, Amber, amd-gfx@lists.freedesktop.org
AMD General
Series is Reviewed-by: Jesse Zhang <Jesse.Zhang@amd.com>
> -----Original Message-----
> From: Lin, Amber <Amber.Lin@amd.com>
> Sent: Thursday, June 18, 2026 9:23 AM
> To: amd-gfx@lists.freedesktop.org; Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>
> Cc: Lin, Amber <Amber.Lin@amd.com>
> Subject: [PATCH 2/2] drm/amdkfd: Remove extra function
>
> recover_bad_queue_mes is not needed. Call reset_queues_mes directly.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> ---
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 19 +++++--------------
> 1 file changed, 5 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 4e60d9364e37..83be54372b18 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -72,11 +72,11 @@ static int allocate_sdma_queue(struct
> device_queue_manager *dqm,
> struct queue *q, const uint32_t *restore_sdma_id);
>
> static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool
> is_sdma); -static int recover_bad_queue_mes(struct device_queue_manager *dqm,
> struct queue *q); static struct queue *find_queue_by_doorbell_offset(struct
> device_queue_manager *dqm,
> u32 doorbell_offset);
> static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue
> *q,
> struct qcm_process_device *qpd);
> +static int reset_queues_mes(struct device_queue_manager *dqm, struct
> +queue *q);
>
> static inline
> enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type
> type) @@ -333,11 +333,12 @@ static int
> remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, st
> amdgpu_mes_unlock(&adev->mes);
> up_read(&adev->reset_domain->sem);
>
> + /* If is_for_reset set, it is a mes internal cleanup */
> if (!r || is_for_reset)
> return r;
>
> - /* remove_hw_queue failed. try to recover */
> - r = recover_bad_queue_mes(dqm, q);
> + /* remove_hw_queue failure indicates a queue hang. reset the queue */
> + r = reset_queues_mes(dqm, q);
> if (r && amdgpu_gpu_recovery) {
> dev_err(adev->dev, "failed to remove queue from MES,
> doorbell=0x%x\n",
> q->properties.doorbell_off);
> @@ -488,16 +489,6 @@ static int reset_queues_mes(struct
> device_queue_manager *dqm, struct queue *q)
> return r;
> }
>
> -static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct
> queue *q) -{
> - struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> - int r = 0;
> -
> - r = reset_queues_mes(dqm, q);
> -
> - return r;
> -}
> -
> static void increment_queue_count(struct device_queue_manager *dqm,
> struct qcm_process_device *qpd,
> struct queue *q)
> @@ -3269,7 +3260,7 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node
> *knode, u32 pasid, u32 doorbel
>
> list_for_each_entry(q, &qpd->queues_list, list) {
> if (q->doorbell_id == doorbell_id && q->properties.is_active) {
> - recover_bad_queue_mes(dqm, q);
> + reset_queues_mes(dqm, q);
> q->properties.is_evicted = true;
> q->properties.is_active = false;
> decrement_queue_count(dqm, qpd, q);
> --
> 2.43.0
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2026-06-18 1:32 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-18 1:23 [PATCH 1/2] drm/amdgpu: Fix mes remove_hw_queue lock Amber Lin
2026-06-18 1:23 ` [PATCH 2/2] drm/amdkfd: Remove extra function Amber Lin
2026-06-18 1:32 ` Zhang, Jesse(Jie)
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.