All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/amdgpu: Fix mes remove_hw_queue lock
@ 2026-06-18  1:23 Amber Lin
  2026-06-18  1:23 ` [PATCH 2/2] drm/amdkfd: Remove extra function Amber Lin
  0 siblings, 1 reply; 3+ messages in thread
From: Amber Lin @ 2026-06-18  1:23 UTC (permalink / raw)
  To: amd-gfx, Jesse.Zhang; +Cc: Amber Lin

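The down_read_trylock()/up_read() pair on adev->reset_domain->sem should
be held around the MES remove_hw_queue call in
amdgpu_gfx_reset_mes_compute(), not around reset_queues_mes() in
recover_bad_queue_mes(). Move the locking accordingly.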

Fixes: f5587b4740c0 ("drm/amdgpu: Remove faulty queue before resume")
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c               | 5 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 0506b90f318e..982b41606d48 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -2358,9 +2358,14 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
 		 * preempted successfuly. Remove it before resume all so it
 		 * doesn't get mapped back
 		 */
+		if (!down_read_trylock(&adev->reset_domain->sem)) {
+			r = -EIO;
+			goto out;
+		}
 		amdgpu_mes_lock(&adev->mes);
 		r = adev->mes.funcs->remove_hw_queue(&adev->mes, queue_input);
 		amdgpu_mes_unlock(&adev->mes);
+		up_read(&adev->reset_domain->sem);
 	}
 
 out:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index fc1d179148c0..4e60d9364e37 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -493,12 +493,8 @@ static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct queue
 	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
 	int r = 0;
 
-	if (!down_read_trylock(&adev->reset_domain->sem))
-		return -EIO;
-
 	r = reset_queues_mes(dqm, q);
 
-	up_read(&adev->reset_domain->sem);
 	return r;
 }
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/2] drm/amdkfd: Remove extra function
  2026-06-18  1:23 [PATCH 1/2] drm/amdgpu: Fix mes remove_hw_queue lock Amber Lin
@ 2026-06-18  1:23 ` Amber Lin
  2026-06-18  1:32   ` Zhang, Jesse(Jie)
  0 siblings, 1 reply; 3+ messages in thread
From: Amber Lin @ 2026-06-18  1:23 UTC (permalink / raw)
  To: amd-gfx, Jesse.Zhang; +Cc: Amber Lin

recover_bad_queue_mes() is only a thin wrapper around
reset_queues_mes(). Remove it and call reset_queues_mes() directly.

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 4e60d9364e37..83be54372b18 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -72,11 +72,11 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q, const uint32_t *restore_sdma_id);
 
 static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
-static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct queue *q);
 static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
 						   u32 doorbell_offset);
 static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
 			       struct qcm_process_device *qpd);
+static int reset_queues_mes(struct device_queue_manager *dqm, struct queue *q);
 
 static inline
 enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
@@ -333,11 +333,12 @@ static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, st
 	amdgpu_mes_unlock(&adev->mes);
 	up_read(&adev->reset_domain->sem);
 
+	/* If is_for_reset set, it is a mes internal cleanup */
 	if (!r || is_for_reset)
 		return r;
 
-	/* remove_hw_queue failed. try to recover */
-	r = recover_bad_queue_mes(dqm, q);
+	/* remove_hw_queue failure indicates a queue hang. reset the queue */
+	r = reset_queues_mes(dqm, q);
 	if (r && amdgpu_gpu_recovery) {
 		dev_err(adev->dev, "failed to remove queue from MES, doorbell=0x%x\n",
 			q->properties.doorbell_off);
@@ -488,16 +489,6 @@ static int reset_queues_mes(struct device_queue_manager *dqm, struct queue *q)
 	return r;
 }
 
-static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct queue *q)
-{
-	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
-	int r = 0;
-
-	r = reset_queues_mes(dqm, q);
-
-	return r;
-}
-
 static void increment_queue_count(struct device_queue_manager *dqm,
 				  struct qcm_process_device *qpd,
 				  struct queue *q)
@@ -3269,7 +3260,7 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbel
 
 		list_for_each_entry(q, &qpd->queues_list, list) {
 			if (q->doorbell_id == doorbell_id && q->properties.is_active) {
-				recover_bad_queue_mes(dqm, q);
+				reset_queues_mes(dqm, q);
 				q->properties.is_evicted = true;
 				q->properties.is_active = false;
 				decrement_queue_count(dqm, qpd, q);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* RE: [PATCH 2/2] drm/amdkfd: Remove extra function
  2026-06-18  1:23 ` [PATCH 2/2] drm/amdkfd: Remove extra function Amber Lin
@ 2026-06-18  1:32   ` Zhang, Jesse(Jie)
  0 siblings, 0 replies; 3+ messages in thread
From: Zhang, Jesse(Jie) @ 2026-06-18  1:32 UTC (permalink / raw)
  To: Lin, Amber, amd-gfx@lists.freedesktop.org

AMD General

Series is Reviewed-by: <Jesse.zhang@amd.com>

> -----Original Message-----
> From: Lin, Amber <Amber.Lin@amd.com>
> Sent: Thursday, June 18, 2026 9:23 AM
> To: amd-gfx@lists.freedesktop.org; Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>
> Cc: Lin, Amber <Amber.Lin@amd.com>
> Subject: [PATCH 2/2] drm/amdkfd: Remove extra function
>
> recover_bad_queue_mes is not needed. Call reset_queues_mes directly.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 19 +++++--------------
>  1 file changed, 5 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 4e60d9364e37..83be54372b18 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -72,11 +72,11 @@ static int allocate_sdma_queue(struct
> device_queue_manager *dqm,
>                               struct queue *q, const uint32_t *restore_sdma_id);
>
>  static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool
> is_sdma); -static int recover_bad_queue_mes(struct device_queue_manager *dqm,
> struct queue *q);  static struct queue *find_queue_by_doorbell_offset(struct
> device_queue_manager *dqm,
>                                                  u32 doorbell_offset);
>  static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue
> *q,
>                              struct qcm_process_device *qpd);
> +static int reset_queues_mes(struct device_queue_manager *dqm, struct
> +queue *q);
>
>  static inline
>  enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type
> type) @@ -333,11 +333,12 @@ static int
> remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, st
>       amdgpu_mes_unlock(&adev->mes);
>       up_read(&adev->reset_domain->sem);
>
> +     /* If is_for_reset set, it is a mes internal cleanup */
>       if (!r || is_for_reset)
>               return r;
>
> -     /* remove_hw_queue failed. try to recover */
> -     r = recover_bad_queue_mes(dqm, q);
> +     /* remove_hw_queue failure indicates a queue hang. reset the queue */
> +     r = reset_queues_mes(dqm, q);
>       if (r && amdgpu_gpu_recovery) {
>               dev_err(adev->dev, "failed to remove queue from MES,
> doorbell=0x%x\n",
>                       q->properties.doorbell_off);
> @@ -488,16 +489,6 @@ static int reset_queues_mes(struct
> device_queue_manager *dqm, struct queue *q)
>       return r;
>  }
>
> -static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct
> queue *q) -{
> -     struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> -     int r = 0;
> -
> -     r = reset_queues_mes(dqm, q);
> -
> -     return r;
> -}
> -
>  static void increment_queue_count(struct device_queue_manager *dqm,
>                                 struct qcm_process_device *qpd,
>                                 struct queue *q)
> @@ -3269,7 +3260,7 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node
> *knode, u32 pasid, u32 doorbel
>
>               list_for_each_entry(q, &qpd->queues_list, list) {
>                       if (q->doorbell_id == doorbell_id && q->properties.is_active) {
> -                             recover_bad_queue_mes(dqm, q);
> +                             reset_queues_mes(dqm, q);
>                               q->properties.is_evicted = true;
>                               q->properties.is_active = false;
>                               decrement_queue_count(dqm, qpd, q);
> --
> 2.43.0


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-06-18  1:32 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-18  1:23 [PATCH 1/2] drm/amdgpu: Fix mes remove_hw_queue lock Amber Lin
2026-06-18  1:23 ` [PATCH 2/2] drm/amdkfd: Remove extra function Amber Lin
2026-06-18  1:32   ` Zhang, Jesse(Jie)

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.