* [PATCH 1/2] drm/amdgpu: Fix mes remove_hw_queue lock
@ 2026-06-18 1:23 Amber Lin
2026-06-18 1:23 ` [PATCH 2/2] drm/amdkfd: Remove extra function Amber Lin
0 siblings, 1 reply; 3+ messages in thread
From: Amber Lin @ 2026-06-18 1:23 UTC (permalink / raw)
To: amd-gfx, Jesse.Zhang; +Cc: Amber Lin
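The down_read_trylock/up_read of the adev->reset_domain semaphore should be
placed around the remove_hw_queue call in amdgpu_gfx_reset_mes_compute,
instead of around reset_queues_mes in recover_bad_queue_mes.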
Fixes: f5587b4740c0 ("drm/amdgpu: Remove faulty queue before resume")
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 5 +++++
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ----
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 0506b90f318e..982b41606d48 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -2358,9 +2358,14 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
* preempted successfuly. Remove it before resume all so it
* doesn't get mapped back
*/
+ if (!down_read_trylock(&adev->reset_domain->sem)) {
+ r = -EIO;
+ goto out;
+ }
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->remove_hw_queue(&adev->mes, queue_input);
amdgpu_mes_unlock(&adev->mes);
+ up_read(&adev->reset_domain->sem);
}
out:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index fc1d179148c0..4e60d9364e37 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -493,12 +493,8 @@ static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct queue
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
int r = 0;
- if (!down_read_trylock(&adev->reset_domain->sem))
- return -EIO;
-
r = reset_queues_mes(dqm, q);
- up_read(&adev->reset_domain->sem);
return r;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 3+ messages in thread* [PATCH 2/2] drm/amdkfd: Remove extra function 2026-06-18 1:23 [PATCH 1/2] drm/amdgpu: Fix mes remove_hw_queue lock Amber Lin @ 2026-06-18 1:23 ` Amber Lin 2026-06-18 1:32 ` Zhang, Jesse(Jie) 0 siblings, 1 reply; 3+ messages in thread From: Amber Lin @ 2026-06-18 1:23 UTC (permalink / raw) To: amd-gfx, Jesse.Zhang; +Cc: Amber Lin recover_bad_queue_mes is not needed. Call reset_queues_mes directly. Signed-off-by: Amber Lin <Amber.Lin@amd.com> --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 4e60d9364e37..83be54372b18 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -72,11 +72,11 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, struct queue *q, const uint32_t *restore_sdma_id); static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma); -static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct queue *q); static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, u32 doorbell_offset); static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); +static int reset_queues_mes(struct device_queue_manager *dqm, struct queue *q); static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) @@ -333,11 +333,12 @@ static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, st amdgpu_mes_unlock(&adev->mes); up_read(&adev->reset_domain->sem); + /* If is_for_reset set, it is a mes internal cleanup */ if (!r || is_for_reset) return r; - /* remove_hw_queue failed. 
try to recover */ - r = recover_bad_queue_mes(dqm, q); + /* remove_hw_queue failure indicates a queue hang. reset the queue */ + r = reset_queues_mes(dqm, q); if (r && amdgpu_gpu_recovery) { dev_err(adev->dev, "failed to remove queue from MES, doorbell=0x%x\n", q->properties.doorbell_off); @@ -488,16 +489,6 @@ static int reset_queues_mes(struct device_queue_manager *dqm, struct queue *q) return r; } -static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct queue *q) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; - int r = 0; - - r = reset_queues_mes(dqm, q); - - return r; -} - static void increment_queue_count(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) @@ -3269,7 +3260,7 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbel list_for_each_entry(q, &qpd->queues_list, list) { if (q->doorbell_id == doorbell_id && q->properties.is_active) { - recover_bad_queue_mes(dqm, q); + reset_queues_mes(dqm, q); q->properties.is_evicted = true; q->properties.is_active = false; decrement_queue_count(dqm, qpd, q); -- 2.43.0 ^ permalink raw reply related [flat|nested] 3+ messages in thread
* RE: [PATCH 2/2] drm/amdkfd: Remove extra function 2026-06-18 1:23 ` [PATCH 2/2] drm/amdkfd: Remove extra function Amber Lin @ 2026-06-18 1:32 ` Zhang, Jesse(Jie) 0 siblings, 0 replies; 3+ messages in thread From: Zhang, Jesse(Jie) @ 2026-06-18 1:32 UTC (permalink / raw) To: Lin, Amber, amd-gfx@lists.freedesktop.org AMD General Series is Reviewed-by: <Jesse.zhang@amd.com> > -----Original Message----- > From: Lin, Amber <Amber.Lin@amd.com> > Sent: Thursday, June 18, 2026 9:23 AM > To: amd-gfx@lists.freedesktop.org; Zhang, Jesse(Jie) <Jesse.Zhang@amd.com> > Cc: Lin, Amber <Amber.Lin@amd.com> > Subject: [PATCH 2/2] drm/amdkfd: Remove extra function > > recover_bad_queue_mes is not needed. Call reset_queues_mes directly. > > Signed-off-by: Amber Lin <Amber.Lin@amd.com> > --- > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 19 +++++-------------- > 1 file changed, 5 insertions(+), 14 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index 4e60d9364e37..83be54372b18 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -72,11 +72,11 @@ static int allocate_sdma_queue(struct > device_queue_manager *dqm, > struct queue *q, const uint32_t *restore_sdma_id); > > static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool > is_sdma); -static int recover_bad_queue_mes(struct device_queue_manager *dqm, > struct queue *q); static struct queue *find_queue_by_doorbell_offset(struct > device_queue_manager *dqm, > u32 doorbell_offset); > static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue > *q, > struct qcm_process_device *qpd); > +static int reset_queues_mes(struct device_queue_manager *dqm, struct > +queue *q); > > static inline > enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type > type) @@ -333,11 +333,12 @@ static int > 
remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, st > amdgpu_mes_unlock(&adev->mes); > up_read(&adev->reset_domain->sem); > > + /* If is_for_reset set, it is a mes internal cleanup */ > if (!r || is_for_reset) > return r; > > - /* remove_hw_queue failed. try to recover */ > - r = recover_bad_queue_mes(dqm, q); > + /* remove_hw_queue failure indicates a queue hang. reset the queue */ > + r = reset_queues_mes(dqm, q); > if (r && amdgpu_gpu_recovery) { > dev_err(adev->dev, "failed to remove queue from MES, > doorbell=0x%x\n", > q->properties.doorbell_off); > @@ -488,16 +489,6 @@ static int reset_queues_mes(struct > device_queue_manager *dqm, struct queue *q) > return r; > } > > -static int recover_bad_queue_mes(struct device_queue_manager *dqm, struct > queue *q) -{ > - struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; > - int r = 0; > - > - r = reset_queues_mes(dqm, q); > - > - return r; > -} > - > static void increment_queue_count(struct device_queue_manager *dqm, > struct qcm_process_device *qpd, > struct queue *q) > @@ -3269,7 +3260,7 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node > *knode, u32 pasid, u32 doorbel > > list_for_each_entry(q, &qpd->queues_list, list) { > if (q->doorbell_id == doorbell_id && q->properties.is_active) { > - recover_bad_queue_mes(dqm, q); > + reset_queues_mes(dqm, q); > q->properties.is_evicted = true; > q->properties.is_active = false; > decrement_queue_count(dqm, qpd, q); > -- > 2.43.0 ^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads: [~2026-06-18 1:32 UTC | newest] Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2026-06-18 1:23 [PATCH 1/2] drm/amdgpu: Fix mes remove_hw_queue lock Amber Lin 2026-06-18 1:23 ` [PATCH 2/2] drm/amdkfd: Remove extra function Amber Lin 2026-06-18 1:32 ` Zhang, Jesse(Jie)
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.