AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain
@ 2026-04-14  8:55 Prike Liang
  2026-04-14  8:55 ` [PATCH 2/2] drm/amdgpu: fix userq destroy reservation inversion Prike Liang
  2026-04-14 13:35 ` [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain Christian König
  0 siblings, 2 replies; 4+ messages in thread
From: Prike Liang @ 2026-04-14  8:55 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, Christian.Koenig, Prike Liang

User queue restore takes reservation locks before userq_mutex, but the
create and destroy paths can take userq_mutex and then nest
reset_domain->sem under it. Lockdep rightfully reports that as a
possible deadlock against the restore worker and other reservation
users.

Fix this by keeping reset_domain->sem outside the userq_mutex section in
the create path, and by moving queue cleanup out from under userq_mutex
in the destroy path. Remove the queue from the global doorbell lookup
before dropping userq_mutex so IRQ paths cannot access it while teardown
continues.

Signed-off-by: Prike Liang <Prike.Liang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 2408f888c4d9..551426741a7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -447,8 +447,6 @@ static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
 	/* Drop the userq reference. */
 	amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
 	uq_funcs->mqd_destroy(queue);
-	/* Use interrupt-safe locking since IRQ handlers may access these XArrays */
-	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
 	amdgpu_userq_fence_driver_free(queue);
 	queue->fence_drv = NULL;
 	queue->userq_mgr = NULL;
@@ -662,8 +660,12 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
 		drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW mapping userq\n");
 		queue->state = AMDGPU_USERQ_STATE_HUNG;
 	}
-	amdgpu_userq_cleanup(queue);
+	/* Remove the queue from the global doorbell lookup before dropping
+	 * userq_mutex so IRQ paths can't access it while cleanup continues.
+	 */
+	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
 	mutex_unlock(&uq_mgr->userq_mutex);
+	amdgpu_userq_cleanup(queue);
 
 	pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
 
@@ -799,6 +801,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 		goto clean_fence_driver;
 	}
 
+	/*
+	 * Keep reset_domain->sem outside the userq_mutex section returned by
+	 * amdgpu_userq_ensure_ev_fence(). Restore acquires reservation locks
+	 * before userq_mutex, so taking reset_domain->sem after userq_mutex
+	 * would invert the established order and trigger lockdep.
+	 */
+	down_read(&adev->reset_domain->sem);
 	amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
 
 	/* don't map the queue if scheduling is halted */
@@ -812,16 +821,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 		r = amdgpu_userq_map_helper(queue);
 		if (r) {
 			drm_file_err(uq_mgr->file, "Failed to map Queue\n");
-			goto clean_mqd;
+			goto clean_reset_domain;
 		}
 	}
 
 	/* drop this refcount during queue destroy */
 	kref_init(&queue->refcount);
 
-	/* Wait for mode-1 reset to complete */
-	down_read(&adev->reset_domain->sem);
-
 	r = xa_alloc(&uq_mgr->userq_xa, &qid, queue,
 		     XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
 	if (r) {
@@ -850,7 +856,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 
 clean_reset_domain:
 	up_read(&adev->reset_domain->sem);
-clean_mqd:
 	mutex_unlock(&uq_mgr->userq_mutex);
 	uq_funcs->mqd_destroy(queue);
 clean_fence_driver:
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-04-20  2:37 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-14  8:55 [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain Prike Liang
2026-04-14  8:55 ` [PATCH 2/2] drm/amdgpu: fix userq destroy reservation inversion Prike Liang
2026-04-14 13:35 ` [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain Christian König
2026-04-20  2:36   ` Liang, Prike

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox