* [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain
From: Prike Liang @ 2026-04-14 8:55 UTC
To: amd-gfx; +Cc: Alexander.Deucher, Christian.Koenig, Prike Liang
User queue restore takes reservation locks before userq_mutex, but the
create and destroy paths can take userq_mutex and then nest
reset_domain->sem under it. Lockdep rightfully reports that as a
possible deadlock against the restore worker and other reservation
users.
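Reduced to their locking skeletons, the two orderings look roughly
like this (a simplified sketch, not the literal call sequences):

    /* restore worker: reservation lock first, then userq_mutex */
    amdgpu_bo_reserve(bo, false);
    mutex_lock(&uq_mgr->userq_mutex);

    /* create/destroy: userq_mutex first, reset_domain->sem nested */
    mutex_lock(&uq_mgr->userq_mutex);
    down_read(&adev->reset_domain->sem);

Combined with the driver's existing reset_domain->sem ->
reservation_ww_class_mutex dependency, these chains close into a
cycle.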
Fix this by keeping reset_domain->sem outside the userq_mutex section in
the create path, and by moving queue cleanup out from under userq_mutex
in the destroy path. Remove the queue from the global doorbell lookup
before dropping userq_mutex so IRQ paths cannot access it while teardown
continues.
Signed-off-by: Prike Liang <Prike.Liang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 2408f888c4d9..551426741a7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -447,8 +447,6 @@ static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
/* Drop the userq reference. */
amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
uq_funcs->mqd_destroy(queue);
- /* Use interrupt-safe locking since IRQ handlers may access these XArrays */
- xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
amdgpu_userq_fence_driver_free(queue);
queue->fence_drv = NULL;
queue->userq_mgr = NULL;
@@ -662,8 +660,12 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW mapping userq\n");
queue->state = AMDGPU_USERQ_STATE_HUNG;
}
- amdgpu_userq_cleanup(queue);
+ /* Remove the queue from the global doorbell lookup before dropping
+ * userq_mutex so IRQ paths can't access it while cleanup continues.
+ */
+ xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
mutex_unlock(&uq_mgr->userq_mutex);
+ amdgpu_userq_cleanup(queue);
pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
@@ -799,6 +801,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
goto clean_fence_driver;
}
+ /*
+ * Keep reset_domain->sem outside the userq_mutex section returned by
+ * amdgpu_userq_ensure_ev_fence(). Restore acquires reservation locks
+ * before userq_mutex, so taking reset_domain->sem after userq_mutex
+ * would invert the established order and trigger lockdep.
+ */
+ down_read(&adev->reset_domain->sem);
amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
/* don't map the queue if scheduling is halted */
@@ -812,16 +821,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
r = amdgpu_userq_map_helper(queue);
if (r) {
drm_file_err(uq_mgr->file, "Failed to map Queue\n");
- goto clean_mqd;
+ goto clean_reset_domain;
}
}
/* drop this refcount during queue destroy */
kref_init(&queue->refcount);
- /* Wait for mode-1 reset to complete */
- down_read(&adev->reset_domain->sem);
-
r = xa_alloc(&uq_mgr->userq_xa, &qid, queue,
XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
if (r) {
@@ -850,7 +856,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
clean_reset_domain:
up_read(&adev->reset_domain->sem);
-clean_mqd:
mutex_unlock(&uq_mgr->userq_mutex);
uq_funcs->mqd_destroy(queue);
clean_fence_driver:
--
2.34.1
* [PATCH 2/2] drm/amdgpu: fix userq destroy reservation inversion
From: Prike Liang @ 2026-04-14 8:55 UTC
To: amd-gfx; +Cc: Alexander.Deucher, Christian.Koenig, Prike Liang
Queue destroy still reserves and unpins the doorbell and write-pointer
BOs while holding userq_mutex, taking reservation_ww_class_mutex under
userq_mutex. The restore worker (amdgpu_userq_restore_worker) takes
those locks in the opposite order, reservation first, so the two paths
form a reservation_ww_class_mutex <-> userq_mutex cycle and can
deadlock against each other.
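In locking terms (a simplified sketch):

    /* restore worker */
    amdgpu_bo_reserve(bo, false);               /* reservation, then... */
    mutex_lock(&uq_mgr->userq_mutex);           /* ...userq_mutex       */

    /* queue destroy, before this patch */
    mutex_lock(&uq_mgr->userq_mutex);           /* userq_mutex, then... */
    amdgpu_bo_reserve(queue->db_obj.obj, true); /* ...reservation       */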
Fix this inversion by moving the pinned BO release into
amdgpu_userq_cleanup(), which runs after userq_mutex has been dropped.
Queue state updates stay serialized under userq_mutex, while all
reservation-taking cleanup moves to the post-unlock path.
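In outline, the destroy path then becomes (a sketch of the intended
ordering, combining this with the previous patch):

    mutex_lock(&uq_mgr->userq_mutex);
    /* unmap, state update, doorbell xa_erase_irq() */
    mutex_unlock(&uq_mgr->userq_mutex);
    amdgpu_userq_cleanup(queue); /* reserve/unpin/unref the BOs here */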
Signed-off-by: Prike Liang <Prike.Liang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 28 +++++++++++------------
1 file changed, 13 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 551426741a7f..3edd74d89f08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -435,6 +435,17 @@ static void amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue
dma_fence_wait(f, false);
}
+static void amdgpu_userq_release_pinned_obj(struct amdgpu_userq_obj *userq_obj)
+{
+ if (!userq_obj->obj)
+ return;
+ if (!amdgpu_bo_reserve(userq_obj->obj, true)) {
+ amdgpu_bo_unpin(userq_obj->obj);
+ amdgpu_bo_unreserve(userq_obj->obj);
+ }
+ amdgpu_bo_unref(&userq_obj->obj);
+}
+
static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
{
struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
@@ -443,7 +454,8 @@ static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
/* Wait for mode-1 reset to complete */
down_read(&adev->reset_domain->sem);
-
+ amdgpu_userq_release_pinned_obj(&queue->db_obj);
+ amdgpu_userq_release_pinned_obj(&queue->wptr_obj);
/* Drop the userq reference. */
amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
uq_funcs->mqd_destroy(queue);
@@ -635,20 +647,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
queue->hang_detect_fence = NULL;
amdgpu_userq_wait_for_last_fence(queue);
- r = amdgpu_bo_reserve(queue->db_obj.obj, true);
- if (!r) {
- amdgpu_bo_unpin(queue->db_obj.obj);
- amdgpu_bo_unreserve(queue->db_obj.obj);
- }
- amdgpu_bo_unref(&queue->db_obj.obj);
-
- r = amdgpu_bo_reserve(queue->wptr_obj.obj, true);
- if (!r) {
- amdgpu_bo_unpin(queue->wptr_obj.obj);
- amdgpu_bo_unreserve(queue->wptr_obj.obj);
- }
- amdgpu_bo_unref(&queue->wptr_obj.obj);
-
atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
#if defined(CONFIG_DEBUG_FS)
debugfs_remove_recursive(queue->debugfs_queue);
--
2.34.1
* Re: [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain
From: Christian König @ 2026-04-14 13:35 UTC
To: Prike Liang, amd-gfx, Khatri, Sunil; +Cc: Alexander.Deucher
On 4/14/26 10:55, Prike Liang wrote:
> User queue restore takes reservation locks before userq_mutex, but the
> create and destroy paths can take userq_mutex and then nest
> reset_domain->sem under it.
Yeah but that is correct behavior.
> Lockdep rightfully reports that as a
> possible deadlock against the restore worker and other reservation
> users.
>
> Fix this by keeping reset_domain->sem outside the userq_mutex section in
> the create path, and by moving queue cleanup out from under userq_mutex
> in the destroy path. Remove the queue from the global doorbell lookup
> before dropping userq_mutex so IRQ paths cannot access it while teardown
> continues.
This is exactly the wrong order.
We need to be able to wait for fences while holding the userq_mutex, and those waits can in turn wait for GPU reset. So the GPU reset can't depend on the userq_mutex.
The order needs to be:
1. reservation lock
2. userq_mutex
3. GPU reset lock
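In code terms that nesting would look roughly like (sketch only,
using the names from the patch):

    amdgpu_bo_reserve(bo, true);          /* 1. reservation lock */
    mutex_lock(&uq_mgr->userq_mutex);     /* 2. userq_mutex      */
    down_read(&adev->reset_domain->sem);  /* 3. GPU reset lock   */
    ...
    up_read(&adev->reset_domain->sem);
    mutex_unlock(&uq_mgr->userq_mutex);
    amdgpu_bo_unreserve(bo);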
Regards,
Christian.
* RE: [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain
From: Liang, Prike @ 2026-04-20 2:36 UTC
To: Koenig, Christian, amd-gfx@lists.freedesktop.org, Khatri, Sunil
Cc: Deucher, Alexander
[Public]
Regards,
Prike
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Tuesday, April 14, 2026 9:36 PM
> To: Liang, Prike <Prike.Liang@amd.com>; amd-gfx@lists.freedesktop.org; Khatri,
> Sunil <Sunil.Khatri@amd.com>
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain
>
> On 4/14/26 10:55, Prike Liang wrote:
> > User queue restore takes reservation locks before userq_mutex, but the
> > create and destroy paths can take userq_mutex and then nest
> > reset_domain->sem under it.
>
> Yeah but that is correct behavior.
>
> > Lockdep rightfully reports that as a
> > possible deadlock against the restore worker and other reservation
> > users.
> >
> > Fix this by keeping reset_domain->sem outside the userq_mutex section
> > in the create path, and by moving queue cleanup out from under
> > userq_mutex in the destroy path. Remove the queue from the global
> > doorbell lookup before dropping userq_mutex so IRQ paths cannot access
> > it while teardown continues.
>
> This is exactly the wrong order.
>
> We need to be able to wait for fences while holding the userq_mutex, and those
> waits can in turn wait for GPU reset. So the GPU reset can't depend on the userq_mutex.
>
> The order needs to be:
> 1. reservation lock
> 2. userq_mutex
> 3. GPU reset lock
Yes, if we keep the current lock acquisition order in the userq IOCTL, we may need to rework the lock sequencing in amdgpu_info_ioctl() to address the lockdep violation shown in the trace below. Meanwhile, I noticed you already adjusted the lock ordering between the reset lock and mmap_lock; I’ll check that change on my side.
Chain exists of:

  &reset_domain->sem --> reservation_ww_class_mutex --> &userq_mgr->userq_mutex