* [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain
@ 2026-04-14  8:55 Prike Liang
  2026-04-14  8:55 ` [PATCH 2/2] drm/amdgpu: fix userq destroy reservation inversion Prike Liang
  2026-04-14 13:35 ` [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain Christian König
  0 siblings, 2 replies; 4+ messages in thread
From: Prike Liang @ 2026-04-14  8:55 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, Christian.Koenig, Prike Liang

User queue restore takes reservation locks before userq_mutex, but the
create and destroy paths can take userq_mutex and then nest
reset_domain->sem under it. Lockdep rightfully reports that as a
possible deadlock against the restore worker and other reservation
users.

Fix this by keeping reset_domain->sem outside the userq_mutex section in
the create path, and by moving queue cleanup out from under userq_mutex
in the destroy path. Remove the queue from the global doorbell lookup
before dropping userq_mutex so IRQ paths cannot access it while teardown
continues.

Signed-off-by: Prike Liang <Prike.Liang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 2408f888c4d9..551426741a7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -447,8 +447,6 @@ static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
 	/* Drop the userq reference. */
 	amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
 	uq_funcs->mqd_destroy(queue);
-	/* Use interrupt-safe locking since IRQ handlers may access these XArrays */
-	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
 	amdgpu_userq_fence_driver_free(queue);
 	queue->fence_drv = NULL;
 	queue->userq_mgr = NULL;
@@ -662,8 +660,12 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
 		drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW mapping userq\n");
 		queue->state = AMDGPU_USERQ_STATE_HUNG;
 	}
-	amdgpu_userq_cleanup(queue);
+	/* Remove the queue from the global doorbell lookup before dropping
+	 * userq_mutex so IRQ paths can't access it while cleanup continues.
+	 */
+	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
 	mutex_unlock(&uq_mgr->userq_mutex);
+	amdgpu_userq_cleanup(queue);
 
 	pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
 
@@ -799,6 +801,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 		goto clean_fence_driver;
 	}
 
+	/*
+	 * Keep reset_domain->sem outside the userq_mutex section returned by
+	 * amdgpu_userq_ensure_ev_fence(). Restore acquires reservation locks
+	 * before userq_mutex, so taking reset_domain->sem after userq_mutex
+	 * would invert the established order and trigger lockdep.
+	 */
+	down_read(&adev->reset_domain->sem);
 	amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
 
 	/* don't map the queue if scheduling is halted */
@@ -812,16 +821,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 		r = amdgpu_userq_map_helper(queue);
 		if (r) {
 			drm_file_err(uq_mgr->file, "Failed to map Queue\n");
-			goto clean_mqd;
+			goto clean_reset_domain;
 		}
 	}
 
 	/* drop this refcount during queue destroy */
 	kref_init(&queue->refcount);
 
-	/* Wait for mode-1 reset to complete */
-	down_read(&adev->reset_domain->sem);
-
 	r = xa_alloc(&uq_mgr->userq_xa, &qid, queue,
 		     XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
 	if (r) {
@@ -850,7 +856,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 
 clean_reset_domain:
 	up_read(&adev->reset_domain->sem);
-clean_mqd:
 	mutex_unlock(&uq_mgr->userq_mutex);
 	uq_funcs->mqd_destroy(queue);
 clean_fence_driver:
-- 
2.34.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread
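[Editorial note] The nesting this patch establishes in the create path can be
boiled down to the sketch below. This is an illustration only, not the driver
code: the function name and parameters are invented, and plain lock objects
stand in for adev->reset_domain->sem and uq_mgr->userq_mutex.

#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>

/*
 * Sketch of the create-path order after this patch: the reset-domain
 * rwsem is taken (read side) before userq_mutex, and both unwind in
 * reverse order on every exit path.  Names are invented for the sketch.
 */
static int userq_create_order_sketch(struct rw_semaphore *reset_sem,
				     struct mutex *userq_mutex,
				     bool map_fails)
{
	int r = 0;

	down_read(reset_sem);		/* outer: waits out a pending GPU reset */
	mutex_lock(userq_mutex);	/* inner: serializes queue bookkeeping */

	if (map_fails)			/* stands in for a failed map helper */
		r = -EINVAL;

	mutex_unlock(userq_mutex);
	up_read(reset_sem);
	return r;
}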
* [PATCH 2/2] drm/amdgpu: fix userq destroy reservation inversion
  2026-04-14  8:55 [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain Prike Liang
@ 2026-04-14  8:55 ` Prike Liang
  0 siblings, 0 replies; 4+ messages in thread
From: Prike Liang @ 2026-04-14  8:55 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, Christian.Koenig, Prike Liang

Queue destroy still reserves and unpins the doorbell and write-pointer
BOs while holding userq_mutex, but the restore worker takes reservation
locks before userq_mutex, so destroy still creates a
reservation_ww_class_mutex -> userq_mutex cycle and can deadlock against
amdgpu_userq_restore_worker.

Fix this reservation inversion by moving the pinned BO release into
amdgpu_userq_cleanup(), which runs after userq_mutex has been dropped.
This keeps queue state updates serialized under userq_mutex while moving
all reservation-taking cleanup to the post-unlock path.

Signed-off-by: Prike Liang <Prike.Liang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 28 +++++++++++------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 551426741a7f..3edd74d89f08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -435,6 +435,17 @@ static void amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue
 		dma_fence_wait(f, false);
 }
 
+static void amdgpu_userq_release_pinned_obj(struct amdgpu_userq_obj *userq_obj)
+{
+	if (!userq_obj->obj)
+		return;
+	if (!amdgpu_bo_reserve(userq_obj->obj, true)) {
+		amdgpu_bo_unpin(userq_obj->obj);
+		amdgpu_bo_unreserve(userq_obj->obj);
+	}
+	amdgpu_bo_unref(&userq_obj->obj);
+}
+
 static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
 {
 	struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
@@ -443,7 +454,8 @@ static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
 
 	/* Wait for mode-1 reset to complete */
 	down_read(&adev->reset_domain->sem);
-
+	amdgpu_userq_release_pinned_obj(&queue->db_obj);
+	amdgpu_userq_release_pinned_obj(&queue->wptr_obj);
 	/* Drop the userq reference. */
 	amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
 	uq_funcs->mqd_destroy(queue);
@@ -635,20 +647,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
 	queue->hang_detect_fence = NULL;
 	amdgpu_userq_wait_for_last_fence(queue);
 
-	r = amdgpu_bo_reserve(queue->db_obj.obj, true);
-	if (!r) {
-		amdgpu_bo_unpin(queue->db_obj.obj);
-		amdgpu_bo_unreserve(queue->db_obj.obj);
-	}
-	amdgpu_bo_unref(&queue->db_obj.obj);
-
-	r = amdgpu_bo_reserve(queue->wptr_obj.obj, true);
-	if (!r) {
-		amdgpu_bo_unpin(queue->wptr_obj.obj);
-		amdgpu_bo_unreserve(queue->wptr_obj.obj);
-	}
-	amdgpu_bo_unref(&queue->wptr_obj.obj);
-
 	atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
 #if defined(CONFIG_DEBUG_FS)
 	debugfs_remove_recursive(queue->debugfs_queue);
-- 
2.34.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread
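[Editorial note] The AB-BA cycle both patches chase is easiest to see side by
side. The sketch below is illustrative only: two plain mutexes stand in for
the dma-resv reservation lock and uq_mgr->userq_mutex, and every function
name is invented.

#include <linux/mutex.h>

static DEFINE_MUTEX(resv_lock);		 /* stands in for reservation_ww_class_mutex */
static DEFINE_MUTEX(userq_mutex_sketch); /* stands in for uq_mgr->userq_mutex */

/* Restore worker order: reservation lock first, then userq_mutex. */
static void restore_worker_sketch(void)
{
	mutex_lock(&resv_lock);
	mutex_lock(&userq_mutex_sketch);
	/* ... re-pin and remap queues ... */
	mutex_unlock(&userq_mutex_sketch);
	mutex_unlock(&resv_lock);
}

/*
 * Old destroy order: userq_mutex first, then BO reserve -- the opposite
 * nesting, which closes an AB-BA cycle against the worker above.
 */
static void destroy_before_patch_sketch(void)
{
	mutex_lock(&userq_mutex_sketch);
	mutex_lock(&resv_lock);		/* amdgpu_bo_reserve() equivalent */
	mutex_unlock(&resv_lock);
	mutex_unlock(&userq_mutex_sketch);
}

/*
 * Fixed destroy order: state updates stay under userq_mutex, while the
 * reservation-taking cleanup runs only after the mutex is dropped.
 */
static void destroy_after_patch_sketch(void)
{
	mutex_lock(&userq_mutex_sketch);
	/* ... mark queue state, unlink from lookup tables ... */
	mutex_unlock(&userq_mutex_sketch);

	mutex_lock(&resv_lock);		/* now taken outside userq_mutex */
	/* ... unpin and unref the pinned BOs ... */
	mutex_unlock(&resv_lock);
}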
* Re: [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain
  2026-04-14  8:55 [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain Prike Liang
  2026-04-14  8:55 ` [PATCH 2/2] drm/amdgpu: fix userq destroy reservation inversion Prike Liang
@ 2026-04-14 13:35 ` Christian König
  2026-04-20  2:36   ` Liang, Prike
  1 sibling, 1 reply; 4+ messages in thread
From: Christian König @ 2026-04-14 13:35 UTC (permalink / raw)
  To: Prike Liang, amd-gfx, Khatri, Sunil; +Cc: Alexander.Deucher

On 4/14/26 10:55, Prike Liang wrote:
> User queue restore takes reservation locks before userq_mutex, but the
> create and destroy paths can take userq_mutex and then nest
> reset_domain->sem under it.

Yeah, but that is correct behavior.

> Lockdep rightfully reports that as a
> possible deadlock against the restore worker and other reservation
> users.
> 
> Fix this by keeping reset_domain->sem outside the userq_mutex section in
> the create path, and by moving queue cleanup out from under userq_mutex
> in the destroy path. Remove the queue from the global doorbell lookup
> before dropping userq_mutex so IRQ paths cannot access it while teardown
> continues.

This is exactly the wrong order.

We need to be able to wait for fences while holding the userq_mutex, and
those fences can in turn wait for GPU reset. So the GPU reset can't
depend on the userq_mutex.

The order needs to be:
1. reservation lock
2. userq_mutex
3. GPU reset lock

Regards,
Christian.

> 
> Signed-off-by: Prike Liang <Prike.Liang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 21 +++++++++++++--------
>  1 file changed, 13 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 2408f888c4d9..551426741a7f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -447,8 +447,6 @@ static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
>  	/* Drop the userq reference. */
>  	amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
>  	uq_funcs->mqd_destroy(queue);
> -	/* Use interrupt-safe locking since IRQ handlers may access these XArrays */
> -	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
>  	amdgpu_userq_fence_driver_free(queue);
>  	queue->fence_drv = NULL;
>  	queue->userq_mgr = NULL;
> @@ -662,8 +660,12 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
>  		drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW mapping userq\n");
>  		queue->state = AMDGPU_USERQ_STATE_HUNG;
>  	}
> -	amdgpu_userq_cleanup(queue);
> +	/* Remove the queue from the global doorbell lookup before dropping
> +	 * userq_mutex so IRQ paths can't access it while cleanup continues.
> +	 */
> +	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
>  	mutex_unlock(&uq_mgr->userq_mutex);
> +	amdgpu_userq_cleanup(queue);
> 
>  	pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> 
> @@ -799,6 +801,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>  		goto clean_fence_driver;
>  	}
> 
> +	/*
> +	 * Keep reset_domain->sem outside the userq_mutex section returned by
> +	 * amdgpu_userq_ensure_ev_fence(). Restore acquires reservation locks
> +	 * before userq_mutex, so taking reset_domain->sem after userq_mutex
> +	 * would invert the established order and trigger lockdep.
> +	 */
> +	down_read(&adev->reset_domain->sem);
>  	amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
> 
>  	/* don't map the queue if scheduling is halted */
> @@ -812,16 +821,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>  		r = amdgpu_userq_map_helper(queue);
>  		if (r) {
>  			drm_file_err(uq_mgr->file, "Failed to map Queue\n");
> -			goto clean_mqd;
> +			goto clean_reset_domain;
>  		}
>  	}
> 
>  	/* drop this refcount during queue destroy */
>  	kref_init(&queue->refcount);
> 
> -	/* Wait for mode-1 reset to complete */
> -	down_read(&adev->reset_domain->sem);
> -
>  	r = xa_alloc(&uq_mgr->userq_xa, &qid, queue,
>  		     XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
>  	if (r) {
> @@ -850,7 +856,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> 
>  clean_reset_domain:
>  	up_read(&adev->reset_domain->sem);
> -clean_mqd:
>  	mutex_unlock(&uq_mgr->userq_mutex);
>  	uq_funcs->mqd_destroy(queue);
>  clean_fence_driver:

^ permalink raw reply	[flat|nested] 4+ messages in thread
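[Editorial note] The ordering Christian asks for, written out as a sketch.
Illustration only: simplified lock objects and an invented function name
stand in for the dma-resv lock, uq_mgr->userq_mutex, and reset_domain->sem.

#include <linux/mutex.h>
#include <linux/rwsem.h>

/*
 * The nesting requested above, outermost first:
 *   1. reservation lock  (dma-resv / ww_mutex on the BOs)
 *   2. userq_mutex       (per-file queue-manager state)
 *   3. GPU reset lock    (reset_domain->sem, read side)
 */
static void required_lock_order_sketch(struct mutex *resv,
				       struct mutex *userq_mutex,
				       struct rw_semaphore *reset_sem)
{
	mutex_lock(resv);		/* 1. outermost */
	mutex_lock(userq_mutex);	/* 2. may wait for queue fences */
	down_read(reset_sem);		/* 3. innermost: fences may wait for reset */

	/* ... fence waits and hardware access happen here ... */

	up_read(reset_sem);
	mutex_unlock(userq_mutex);
	mutex_unlock(resv);
}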
* RE: [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain
  2026-04-14 13:35 ` [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain Christian König
@ 2026-04-20  2:36   ` Liang, Prike
  0 siblings, 0 replies; 4+ messages in thread
From: Liang, Prike @ 2026-04-20  2:36 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx@lists.freedesktop.org, Khatri, Sunil
  Cc: Deucher, Alexander

[Public]

Regards,
Prike

> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Tuesday, April 14, 2026 9:36 PM
> To: Liang, Prike <Prike.Liang@amd.com>; amd-gfx@lists.freedesktop.org;
> Khatri, Sunil <Sunil.Khatri@amd.com>
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain
> 
> On 4/14/26 10:55, Prike Liang wrote:
> > User queue restore takes reservation locks before userq_mutex, but the
> > create and destroy paths can take userq_mutex and then nest
> > reset_domain->sem under it.
> 
> Yeah, but that is correct behavior.
> 
> > Lockdep rightfully reports that as a
> > possible deadlock against the restore worker and other reservation
> > users.
> >
> > Fix this by keeping reset_domain->sem outside the userq_mutex section
> > in the create path, and by moving queue cleanup out from under
> > userq_mutex in the destroy path. Remove the queue from the global
> > doorbell lookup before dropping userq_mutex so IRQ paths cannot access
> > it while teardown continues.
> 
> This is exactly the wrong order.
> 
> We need to be able to wait for fences while holding the userq_mutex, and
> those fences can in turn wait for GPU reset. So the GPU reset can't depend
> on the userq_mutex.
> 
> The order needs to be:
> 1. reservation lock
> 2. userq_mutex
> 3. GPU reset lock

Yes, if we keep the current lock acquisition order in the userq IOCTL, we
may need to rework the lock sequencing in amdgpu_info_ioctl() to address
the lockdep violation shown in the trace below. Meanwhile, I noticed you
already adjusted the lock ordering between the reset lock and mmap_lock;
I’ll check that change on my side.

Chain exists of:
  &reset_domain->sem --> reservation_ww_class_mutex --> &userq_mgr->userq_mutex

> Regards,
> Christian.
> 
> >
> > Signed-off-by: Prike Liang <Prike.Liang@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 21 +++++++++++++--------
> >  1 file changed, 13 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > index 2408f888c4d9..551426741a7f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > @@ -447,8 +447,6 @@ static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
> >  	/* Drop the userq reference. */
> >  	amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
> >  	uq_funcs->mqd_destroy(queue);
> > -	/* Use interrupt-safe locking since IRQ handlers may access these XArrays */
> > -	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
> >  	amdgpu_userq_fence_driver_free(queue);
> >  	queue->fence_drv = NULL;
> >  	queue->userq_mgr = NULL;
> > @@ -662,8 +660,12 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
> >  		drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW mapping userq\n");
> >  		queue->state = AMDGPU_USERQ_STATE_HUNG;
> >  	}
> > -	amdgpu_userq_cleanup(queue);
> > +	/* Remove the queue from the global doorbell lookup before dropping
> > +	 * userq_mutex so IRQ paths can't access it while cleanup continues.
> > +	 */
> > +	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
> >  	mutex_unlock(&uq_mgr->userq_mutex);
> > +	amdgpu_userq_cleanup(queue);
> >
> >  	pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> >
> > @@ -799,6 +801,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> >  		goto clean_fence_driver;
> >  	}
> >
> > +	/*
> > +	 * Keep reset_domain->sem outside the userq_mutex section returned by
> > +	 * amdgpu_userq_ensure_ev_fence(). Restore acquires reservation locks
> > +	 * before userq_mutex, so taking reset_domain->sem after userq_mutex
> > +	 * would invert the established order and trigger lockdep.
> > +	 */
> > +	down_read(&adev->reset_domain->sem);
> >  	amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
> >
> >  	/* don't map the queue if scheduling is halted */
> > @@ -812,16 +821,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> >  		r = amdgpu_userq_map_helper(queue);
> >  		if (r) {
> >  			drm_file_err(uq_mgr->file, "Failed to map Queue\n");
> > -			goto clean_mqd;
> > +			goto clean_reset_domain;
> >  		}
> >  	}
> >
> >  	/* drop this refcount during queue destroy */
> >  	kref_init(&queue->refcount);
> >
> > -	/* Wait for mode-1 reset to complete */
> > -	down_read(&adev->reset_domain->sem);
> > -
> >  	r = xa_alloc(&uq_mgr->userq_xa, &qid, queue,
> >  		     XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
> >  	if (r) {
> > @@ -850,7 +856,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> >
> >  clean_reset_domain:
> >  	up_read(&adev->reset_domain->sem);
> > -clean_mqd:
> >  	mutex_unlock(&uq_mgr->userq_mutex);
> >  	uq_funcs->mqd_destroy(queue);
> >  clean_fence_driver:

^ permalink raw reply	[flat|nested] 4+ messages in thread
end of thread, other threads: [~2026-04-20  2:37 UTC | newest]

Thread overview: 4+ messages
2026-04-14  8:55 [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain Prike Liang
2026-04-14  8:55 ` [PATCH 2/2] drm/amdgpu: fix userq destroy reservation inversion Prike Liang
2026-04-14 13:35 ` [PATCH 1/2] drm/amdgpu: fix userq lock order against reset_domain Christian König
2026-04-20  2:36   ` Liang, Prike