* [PATCH] drm/amdgpu/userq: rework eviction fence suspension lock for fixing lockdep
@ 2026-04-08 2:52 Prike Liang
2026-04-08 8:21 ` Christian König
0 siblings, 1 reply; 5+ messages in thread
From: Prike Liang @ 2026-04-08 2:52 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian.Koenig, Prike Liang
amdgpu_eviction_fence_suspend_worker() ran amdgpu_userq_wait_for_signal()
with userq_mutex held. The helper used to walk the xarray and block on
queue->last_fence while keeping that lock, so the userspace signal path
could never acquire the lock while the worker slept waiting on the fence,
which triggered 120s hung task warnings.
Meanwhile, also rework the userq lock handling in the eviction suspension
path to resolve the lockdep/lock-order issues.
Signed-off-by: Prike Liang <Prike.Liang@amd.com>
---
.../drm/amd/amdgpu/amdgpu_eviction_fence.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 107 ++++++++++++++----
2 files changed, 85 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
index 5ae477c49a53..00c450e31139 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
@@ -73,7 +73,6 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
* allocate memory while holding this lock, but only after ensuring that
* the eviction fence is signaled.
*/
- cookie = dma_fence_begin_signalling();
ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
amdgpu_userq_evict(uq_mgr);
@@ -83,6 +82,7 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
* userq_mutex. Otherwise we won't resume the queues before issuing the
* next fence.
*/
+ cookie = dma_fence_begin_signalling();
dma_fence_signal(ev_fence);
dma_fence_end_signalling(cookie);
dma_fence_put(ev_fence);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 9d3c39e96ac1..7691f169415b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -26,6 +26,7 @@
#include <drm/drm_exec.h>
#include <linux/pm_runtime.h>
#include <drm/drm_drv.h>
+#include <linux/lockdep.h>
#include "amdgpu.h"
#include "amdgpu_reset.h"
@@ -34,6 +35,23 @@
#include "amdgpu_hmm.h"
#include "amdgpu_userq_fence.h"
+#define AMDGPU_USERQ_FENCE_WAIT_POLL_MS 1000
+static unsigned long
+amdgpu_userq_fence_timeout_ms(struct amdgpu_usermode_queue *queue)
+{
+ struct amdgpu_device *adev = queue->userq_mgr->adev;
+ switch (queue->queue_type) {
+ case AMDGPU_RING_TYPE_GFX:
+ return adev->gfx_timeout;
+ case AMDGPU_RING_TYPE_COMPUTE:
+ return adev->compute_timeout;
+ case AMDGPU_RING_TYPE_SDMA:
+ return adev->sdma_timeout;
+ default:
+ return adev->gfx_timeout;
+ }
+}
+
u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
{
int i;
@@ -176,29 +194,12 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
*/
void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
{
- struct amdgpu_device *adev;
unsigned long timeout_ms;
if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev)
return;
- adev = queue->userq_mgr->adev;
- /* Determine timeout based on queue type */
- switch (queue->queue_type) {
- case AMDGPU_RING_TYPE_GFX:
- timeout_ms = adev->gfx_timeout;
- break;
- case AMDGPU_RING_TYPE_COMPUTE:
- timeout_ms = adev->compute_timeout;
- break;
- case AMDGPU_RING_TYPE_SDMA:
- timeout_ms = adev->sdma_timeout;
- break;
- default:
- timeout_ms = adev->gfx_timeout;
- break;
- }
-
+ timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
/* Store the fence to monitor and schedule hang detection */
WRITE_ONCE(queue->hang_detect_fence, queue->last_fence);
schedule_delayed_work(&queue->hang_detect_work,
@@ -1274,16 +1275,76 @@ void amdgpu_userq_reset_work(struct work_struct *work)
static void
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
- struct amdgpu_usermode_queue *queue;
- unsigned long queue_id;
+ lockdep_assert_held(&uq_mgr->userq_mutex);
- xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
- struct dma_fence *f = queue->last_fence;
+ /* Rescan the userq xarray after each fence poll interval to get
+ * newly added queues or fences.
+ */
+ for (;;) {
+ struct amdgpu_usermode_queue *queue;
+ unsigned long queue_id = 0;
+ struct dma_fence *f = NULL;
+ unsigned long timeout_ms = 0;
+ u64 context = 0, seqno = 0;
+ bool signaled = false;
+
+ xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
+ struct dma_fence *tmp = queue->last_fence;
+
+ if (!tmp || dma_fence_is_signaled(tmp))
+ continue;
+
+ f = dma_fence_get(tmp);
+ timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
+ context = tmp->context;
+ seqno = tmp->seqno;
+ break;
+ }
if (!f)
+ return;
+
+ if (!timeout_ms)
+ timeout_ms = 1;
+
+ /*
+ * We can't use dma_fence_wait() here. Waiting there and then
+ * reacquiring userq_mutex creates a lockdep cycle through
+ * dma_fence_map:
+ * userq_mutex -> reservation_ww_class_mutex -> dma_fence_map
+ * and
+ * dma_fence_map -> userq_mutex
+ * Instead, drop the mutex, sleep in bounded intervals, then
+ * reacquire and poll the fence signaled bit.
+ */
+ while (timeout_ms) {
+ unsigned long interval_ms;
+
+ if (dma_fence_is_signaled(f)) {
+ signaled = true;
+ break;
+ }
+
+ interval_ms = min(timeout_ms,
+ (unsigned long)AMDGPU_USERQ_FENCE_WAIT_POLL_MS);
+ mutex_unlock(&uq_mgr->userq_mutex);
+ msleep(interval_ms);
+ mutex_lock(&uq_mgr->userq_mutex);
+ timeout_ms -= interval_ms;
+ }
+
+ if (!signaled && dma_fence_is_signaled(f))
+ signaled = true;
+
+ dma_fence_put(f);
+
+ if (signaled)
continue;
- dma_fence_wait(f, false);
+ drm_dbg(adev_to_drm(uq_mgr->adev),
+ "Timed out waiting for fence=%llu:%llu during eviction\n",
+ context, seqno);
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
}
}
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH] drm/amdgpu/userq: rework eviction fence suspension lock for fixing lockdep
2026-04-08 2:52 [PATCH] drm/amdgpu/userq: rework eviction fence suspension lock for fixing lockdep Prike Liang
@ 2026-04-08 8:21 ` Christian König
2026-04-09 3:35 ` Liang, Prike
0 siblings, 1 reply; 5+ messages in thread
From: Christian König @ 2026-04-08 8:21 UTC (permalink / raw)
To: Prike Liang, amd-gfx; +Cc: Alexander.Deucher
On 4/8/26 04:52, Prike Liang wrote:
> amdgpu_eviction_fence_suspend_worker() ran amdgpu_userq_wait_for_signal()
> with userq_mutex held. The helper used to walk the xarray and block on
> queue->last_fence while keeping that lock, so the userspace signal path
> could never get the lock while the wait fence sleep waiting, then triggering
> 120s hung task warnings.
And that is perfectly intentional.
>
> Meanwhile, there also rework the userq lock access in the eviction suspension
> path for resolving the lockdep/lock order issues.
>
> Signed-off-by: Prike Liang <Prike.Liang@amd.com>
> ---
> .../drm/amd/amdgpu/amdgpu_eviction_fence.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 107 ++++++++++++++----
> 2 files changed, 85 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> index 5ae477c49a53..00c450e31139 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> @@ -73,7 +73,6 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
> * allocate memory while holding this lock, but only after ensuring that
> * the eviction fence is signaled.
> */
> - cookie = dma_fence_begin_signalling();
>
> ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
> amdgpu_userq_evict(uq_mgr);
> @@ -83,6 +82,7 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
> * userq_mutex. Otherwise we won't resume the queues before issuing the
> * next fence.
> */
> + cookie = dma_fence_begin_signalling();
Absolutely clear NAK to that. This only disables the warning but doesn't fix the locking problem.
As far as I can see the patch here is just once more utterly nonsense. What problem are you exactly trying to solve?
Regards,
Christian.
> dma_fence_signal(ev_fence);
> dma_fence_end_signalling(cookie);
> dma_fence_put(ev_fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 9d3c39e96ac1..7691f169415b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -26,6 +26,7 @@
> #include <drm/drm_exec.h>
> #include <linux/pm_runtime.h>
> #include <drm/drm_drv.h>
> +#include <linux/lockdep.h>
>
> #include "amdgpu.h"
> #include "amdgpu_reset.h"
> @@ -34,6 +35,23 @@
> #include "amdgpu_hmm.h"
> #include "amdgpu_userq_fence.h"
>
> +#define AMDGPU_USERQ_FENCE_WAIT_POLL_MS 1000
> +static unsigned long
> +amdgpu_userq_fence_timeout_ms(struct amdgpu_usermode_queue *queue)
> +{
> + struct amdgpu_device *adev = queue->userq_mgr->adev;
> + switch (queue->queue_type) {
> + case AMDGPU_RING_TYPE_GFX:
> + return adev->gfx_timeout;
> + case AMDGPU_RING_TYPE_COMPUTE:
> + return adev->compute_timeout;
> + case AMDGPU_RING_TYPE_SDMA:
> + return adev->sdma_timeout;
> + default:
> + return adev->gfx_timeout;
> + }
> +}
> +
> u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
> {
> int i;
> @@ -176,29 +194,12 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
> */
> void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
> {
> - struct amdgpu_device *adev;
> unsigned long timeout_ms;
>
> if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev)
> return;
>
> - adev = queue->userq_mgr->adev;
> - /* Determine timeout based on queue type */
> - switch (queue->queue_type) {
> - case AMDGPU_RING_TYPE_GFX:
> - timeout_ms = adev->gfx_timeout;
> - break;
> - case AMDGPU_RING_TYPE_COMPUTE:
> - timeout_ms = adev->compute_timeout;
> - break;
> - case AMDGPU_RING_TYPE_SDMA:
> - timeout_ms = adev->sdma_timeout;
> - break;
> - default:
> - timeout_ms = adev->gfx_timeout;
> - break;
> - }
> -
> + timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
> /* Store the fence to monitor and schedule hang detection */
> WRITE_ONCE(queue->hang_detect_fence, queue->last_fence);
> schedule_delayed_work(&queue->hang_detect_work,
> @@ -1274,16 +1275,76 @@ void amdgpu_userq_reset_work(struct work_struct *work)
> static void
> amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
> {
> - struct amdgpu_usermode_queue *queue;
> - unsigned long queue_id;
> + lockdep_assert_held(&uq_mgr->userq_mutex);
>
> - xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
> - struct dma_fence *f = queue->last_fence;
> + /* Rescan the userq xarray after each fence poll interval to get
> + * newly added queues or fences.
> + */
> + for (;;) {
> + struct amdgpu_usermode_queue *queue;
> + unsigned long queue_id = 0;
> + struct dma_fence *f = NULL;
> + unsigned long timeout_ms = 0;
> + u64 context = 0, seqno = 0;
> + bool signaled = false;
> +
> + xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
> + struct dma_fence *tmp = queue->last_fence;
> +
> + if (!tmp || dma_fence_is_signaled(tmp))
> + continue;
> +
> + f = dma_fence_get(tmp);
> + timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
> + context = tmp->context;
> + seqno = tmp->seqno;
> + break;
> + }
>
> if (!f)
> + return;
> +
> + if (!timeout_ms)
> + timeout_ms = 1;
> +
> + /*
> + * We can't use dma_fence_wait() here. Waiting there and then
> + * reacquiring userq_mutex creates a lockdep cycle through
> + * dma_fence_map:
> + * userq_mutex -> reservation_ww_class_mutex -> dma_fence_map
> + * and
> + * dma_fence_map -> userq_mutex
> + * Instead, drop the mutex, sleep in bounded intervals, then
> + * reacquire and poll the fence signaled bit.
> + */
> + while (timeout_ms) {
> + unsigned long interval_ms;
> +
> + if (dma_fence_is_signaled(f)) {
> + signaled = true;
> + break;
> + }
> +
> + interval_ms = min(timeout_ms,
> + (unsigned long)AMDGPU_USERQ_FENCE_WAIT_POLL_MS);
> + mutex_unlock(&uq_mgr->userq_mutex);
> + msleep(interval_ms);
> + mutex_lock(&uq_mgr->userq_mutex);
> + timeout_ms -= interval_ms;
> + }
> +
> + if (!signaled && dma_fence_is_signaled(f))
> + signaled = true;
> +
> + dma_fence_put(f);
> +
> + if (signaled)
> continue;
>
> - dma_fence_wait(f, false);
> + drm_dbg(adev_to_drm(uq_mgr->adev),
> + "Timed out waiting for fence=%llu:%llu during eviction\n",
> + context, seqno);
> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> }
> }
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* RE: [PATCH] drm/amdgpu/userq: rework eviction fence suspension lock for fixing lockdep
2026-04-08 8:21 ` Christian König
@ 2026-04-09 3:35 ` Liang, Prike
2026-04-09 11:47 ` Christian König
0 siblings, 1 reply; 5+ messages in thread
From: Liang, Prike @ 2026-04-09 3:35 UTC (permalink / raw)
To: Koenig, Christian, amd-gfx@lists.freedesktop.org; +Cc: Deucher, Alexander
[Public]
Regards,
Prike
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Wednesday, April 8, 2026 4:22 PM
> To: Liang, Prike <Prike.Liang@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/userq: rework eviction fence suspension lock for
> fixing lockdep
>
> On 4/8/26 04:52, Prike Liang wrote:
> > amdgpu_eviction_fence_suspend_worker() ran
> > amdgpu_userq_wait_for_signal() with userq_mutex held. The helper used
> > to walk the xarray and block on
> > queue->last_fence while keeping that lock, so the userspace signal
> > queue->path
> > could never get the lock while the wait fence sleep waiting, then
> > triggering 120s hung task warnings.
>
> And that is perfectly intentional.
>
> >
> > Meanwhile, there also rework the userq lock access in the eviction
> > suspension path for resolving the lockdep/lock order issues.
> >
> > Signed-off-by: Prike Liang <Prike.Liang@amd.com>
> > ---
> > .../drm/amd/amdgpu/amdgpu_eviction_fence.c | 2 +-
> > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 107 ++++++++++++++----
> > 2 files changed, 85 insertions(+), 24 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> > index 5ae477c49a53..00c450e31139 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> > @@ -73,7 +73,6 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct
> *work)
> > * allocate memory while holding this lock, but only after ensuring that
> > * the eviction fence is signaled.
> > */
> > - cookie = dma_fence_begin_signalling();
> >
> > ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
> > amdgpu_userq_evict(uq_mgr);
> > @@ -83,6 +82,7 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct
> *work)
> > * userq_mutex. Otherwise we won't resume the queues before issuing the
> > * next fence.
> > */
> > + cookie = dma_fence_begin_signalling();
>
> Absolutely clear NAK to that. This only disables the warning but doesn't fix the
> locking problem.
>
> As far as I can see the patch here is just once more utterly nonsense. What problem
> are you exactly trying to solve?
There's a lock issue, shown below, caused by acquiring the userq mutex and then
sleeping in dma_fence_wait() inside amdgpu_userq_wait_for_signal(). This solution
avoids waiting on the userq fence while holding the userq mutex, and reworks the
lockdep ordering between the userq mutex and the dma-fence signalling annotation.
9] INFO: task Xorg:cs0:2019 blocked for more than 120 seconds.
[ 7130.223182] Tainted: G U OE 6.19.0-custom #16
[ 7130.223468] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 7130.223822] task:Xorg:cs0 state:D stack:0 pid:2019 tgid:2016 ppid:2014 task_flags:0x400040 flags:0x00080000
[ 7130.223849] Call Trace:
[ 7130.223861] <TASK>
[ 7130.223880] __schedule+0x570/0x1200
[ 7130.223905] schedule+0x47/0x160
[ 7130.223912] schedule_preempt_disabled+0x19/0x30
[ 7130.223918] __mutex_lock+0x6b1/0x10d0
[ 7130.223934] ? amdgpu_userq_ensure_ev_fence+0x3c/0x110 [amdgpu]
[ 7130.224240] mutex_lock_nested+0x1f/0x30
[ 7130.224245] ? mutex_lock_nested+0x1f/0x30
[ 7130.224251] amdgpu_userq_ensure_ev_fence+0x3c/0x110 [amdgpu]
[ 7130.224512] amdgpu_userq_signal_ioctl+0x571/0x1060 [amdgpu]
[ 7130.224748] ? srso_return_thunk+0x5/0x5f
[ 7130.224755] ? __lock_acquire+0x43e/0x2210
[ 7130.224785] ? srso_return_thunk+0x5/0x5f
[ 7130.224790] ? lock_acquire+0xc6/0x2c0
[ 7130.224797] ? drm_dev_enter+0x58/0xe0 [drm]
[ 7130.224833] ? srso_return_thunk+0x5/0x5f
[ 7130.224838] ? sched_clock_noinstr+0xd/0x20
[ 7130.224844] ? srso_return_thunk+0x5/0x5f
[ 7130.224849] ? local_clock_noinstr+0x12/0xc0
[ 7130.224857] ? srso_return_thunk+0x5/0x5f
[ 7130.224861] ? local_clock+0x19/0x40
[ 7130.224867] ? srso_return_thunk+0x5/0x5f
[ 7130.224872] ? lock_release+0x27d/0x3c0
[ 7130.224887] ? __pfx_amdgpu_userq_signal_ioctl+0x10/0x10 [amdgpu]
[ 7130.225110] drm_ioctl_kernel+0xaf/0x110 [drm]
[ 7130.225149] drm_ioctl+0x290/0x510 [drm]
[ 7130.225176] ? __pfx_amdgpu_userq_signal_ioctl+0x10/0x10 [amdgpu]
[ 7130.225417] ? srso_return_thunk+0x5/0x5f
[ 7130.225423] ? trace_hardirqs_on+0x5f/0xc0
[ 7130.225429] ? srso_return_thunk+0x5/0x5f
[ 7130.225434] ? _raw_spin_unlock_irqrestore+0x35/0x60
[ 7130.225449] amdgpu_drm_ioctl+0x52/0x90 [amdgpu]
[ 7130.225657] __x64_sys_ioctl+0xa0/0xf0
[ 7130.225672] x64_sys_call+0x1280/0x21b0
[ 7130.225679] do_syscall_64+0x6f/0x760
[ 7130.225690] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 7130.225695] RIP: 0033:0x7f5339d1a9cf
[ 7130.225701] RSP: 002b:00007f532d3fe540 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 7130.225708] RAX: ffffffffffffffda RBX: 0000000000000010 RCX: 00007f5339d1a9cf
[ 7130.225712] RDX: 00007f532d3fe730 RSI: 00000000c0306457 RDI: 000000000000000f
[ 7130.225716] RBP: 00007f532d3fe5c0 R08: 00007f532d3fe630 R09: 0000000000000001
[ 7130.225720] R10: 00007f532d3fe818 R11: 0000000000000246 R12: 00007f532d3ff640
[ 7130.225724] R13: 0000000000000016 R14: 00007f5339c947d0 R15: 00007ffe53cb5130
[ 7130.225755] </TASK>
[ 7130.225818] INFO: task Xorg:cs0:2019 is blocked on a mutex likely owned by task kworker/5:0:4873.
[ 7130.226193] INFO: task kworker/2:1:4775 blocked for more than 120 seconds.
[ 7130.226514] Tainted: G U OE 6.19.0-custom #16
[ 7130.226764] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 7130.227082] task:kworker/2:1 state:D stack:0 pid:4775 tgid:4775 ppid:2 task_flags:0x4208060 flags:0x00080000
[ 7130.227091] Workqueue: events amdgpu_userq_hang_detect_work [amdgpu]
[ 7130.227341] Call Trace:
[ 7130.227345] <TASK>
[ 7130.227356] __schedule+0x570/0x1200
[ 7130.227378] schedule+0x47/0x160
[ 7130.227385] schedule_preempt_disabled+0x19/0x30
[ 7130.227390] __mutex_lock+0x6b1/0x10d0
[ 7130.227403] ? amdgpu_userq_hang_detect_work+0x5a/0x80 [amdgpu]
[ 7130.227660] mutex_lock_nested+0x1f/0x30
[ 7130.227667] ? mutex_lock_nested+0x1f/0x30
[ 7130.227673] amdgpu_userq_hang_detect_work+0x5a/0x80 [amdgpu]
[ 7130.227918] process_one_work+0x233/0x650
[ 7130.227944] worker_thread+0x1b2/0x360
[ 7130.227957] kthread+0x11c/0x260
[ 7130.227964] ? srso_return_thunk+0x5/0x5f
[ 7130.227969] ? __pfx_worker_thread+0x10/0x10
[ 7130.227977] ? __pfx_kthread+0x10/0x10
[ 7130.227987] ret_from_fork+0x29f/0x2f0
[ 7130.227995] ? __pfx_kthread+0x10/0x10
[ 7130.228003] ret_from_fork_asm+0x1a/0x30
[ 7130.228034] </TASK>
[ 7130.228089] INFO: task kworker/2:1:4775 is blocked on a mutex likely owned by task kworker/5:0:4873.
[ 7130.228462] INFO: task kworker/5:0:4873 blocked for more than 120 seconds.
[ 7130.228743] Tainted: G U OE 6.19.0-custom #16
[ 7130.228993] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 7130.229309] task:kworker/5:0 state:D stack:0 pid:4873 tgid:4873 ppid:2 task_flags:0x4208060 flags:0x00080000
[ 7130.229318] Workqueue: events amdgpu_eviction_fence_suspend_worker [amdgpu]
[ 7130.229543] Call Trace:
[ 7130.229547] <TASK>
[ 7130.229558] __schedule+0x570/0x1200
[ 7130.229579] schedule+0x47/0x160
[ 7130.229586] schedule_timeout+0x10a/0x120
[ 7130.229591] ? srso_return_thunk+0x5/0x5f
[ 7130.229597] ? mark_held_locks+0x54/0x90
[ 7130.229610] ? srso_return_thunk+0x5/0x5f
[ 7130.229615] ? trace_hardirqs_on+0x5f/0xc0
[ 7130.229621] ? srso_return_thunk+0x5/0x5f
[ 7130.229631] dma_fence_default_wait+0x1f5/0x290
[ 7130.229640] ? dma_fence_default_wait+0xfc/0x290
[ 7130.229649] ? __pfx_dma_fence_default_wait_cb+0x10/0x10
[ 7130.229663] dma_fence_wait_timeout+0x300/0x3c0
[ 7130.229675] amdgpu_userq_evict+0x67/0x120 [amdgpu]
[ 7130.229930] amdgpu_eviction_fence_suspend_worker+0x4d/0xd0 [amdgpu]
[ 7130.230162] process_one_work+0x233/0x650
[ 7130.230187] worker_thread+0x1b2/0x360
[ 7130.230200] kthread+0x11c/0x260
[ 7130.230205] ? srso_return_thunk+0x5/0x5f
[ 7130.230211] ? __pfx_worker_thread+0x10/0x10
[ 7130.230218] ? __pfx_kthread+0x10/0x10
[ 7130.230229] ret_from_fork+0x29f/0x2f0
[ 7130.230234] ? __pfx_kthread+0x10/0x10
[ 7130.230243] ret_from_fork_asm+0x1a/0x30
[ 7130.230274] </TASK>
> Regards,
> Christian.
>
> > dma_fence_signal(ev_fence);
> > dma_fence_end_signalling(cookie);
> > dma_fence_put(ev_fence);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > index 9d3c39e96ac1..7691f169415b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > @@ -26,6 +26,7 @@
> > #include <drm/drm_exec.h>
> > #include <linux/pm_runtime.h>
> > #include <drm/drm_drv.h>
> > +#include <linux/lockdep.h>
> >
> > #include "amdgpu.h"
> > #include "amdgpu_reset.h"
> > @@ -34,6 +35,23 @@
> > #include "amdgpu_hmm.h"
> > #include "amdgpu_userq_fence.h"
> >
> > +#define AMDGPU_USERQ_FENCE_WAIT_POLL_MS 1000 static unsigned long
> > +amdgpu_userq_fence_timeout_ms(struct amdgpu_usermode_queue *queue) {
> > + struct amdgpu_device *adev = queue->userq_mgr->adev;
> > + switch (queue->queue_type) {
> > + case AMDGPU_RING_TYPE_GFX:
> > + return adev->gfx_timeout;
> > + case AMDGPU_RING_TYPE_COMPUTE:
> > + return adev->compute_timeout;
> > + case AMDGPU_RING_TYPE_SDMA:
> > + return adev->sdma_timeout;
> > + default:
> > + return adev->gfx_timeout;
> > + }
> > +}
> > +
> > u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev) {
> > int i;
> > @@ -176,29 +194,12 @@ static void amdgpu_userq_hang_detect_work(struct
> > work_struct *work) */ void
> > amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue
> > *queue) {
> > - struct amdgpu_device *adev;
> > unsigned long timeout_ms;
> >
> > if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev)
> > return;
> >
> > - adev = queue->userq_mgr->adev;
> > - /* Determine timeout based on queue type */
> > - switch (queue->queue_type) {
> > - case AMDGPU_RING_TYPE_GFX:
> > - timeout_ms = adev->gfx_timeout;
> > - break;
> > - case AMDGPU_RING_TYPE_COMPUTE:
> > - timeout_ms = adev->compute_timeout;
> > - break;
> > - case AMDGPU_RING_TYPE_SDMA:
> > - timeout_ms = adev->sdma_timeout;
> > - break;
> > - default:
> > - timeout_ms = adev->gfx_timeout;
> > - break;
> > - }
> > -
> > + timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
> > /* Store the fence to monitor and schedule hang detection */
> > WRITE_ONCE(queue->hang_detect_fence, queue->last_fence);
> > schedule_delayed_work(&queue->hang_detect_work,
> > @@ -1274,16 +1275,76 @@ void amdgpu_userq_reset_work(struct
> > work_struct *work) static void amdgpu_userq_wait_for_signal(struct
> > amdgpu_userq_mgr *uq_mgr) {
> > - struct amdgpu_usermode_queue *queue;
> > - unsigned long queue_id;
> > + lockdep_assert_held(&uq_mgr->userq_mutex);
> >
> > - xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
> > - struct dma_fence *f = queue->last_fence;
> > + /* Rescan the userq xarray after each fence poll interval to get
> > + * newly added queues or fences.
> > + */
> > + for (;;) {
> > + struct amdgpu_usermode_queue *queue;
> > + unsigned long queue_id = 0;
> > + struct dma_fence *f = NULL;
> > + unsigned long timeout_ms = 0;
> > + u64 context = 0, seqno = 0;
> > + bool signaled = false;
> > +
> > + xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
> > + struct dma_fence *tmp = queue->last_fence;
> > +
> > + if (!tmp || dma_fence_is_signaled(tmp))
> > + continue;
> > +
> > + f = dma_fence_get(tmp);
> > + timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
> > + context = tmp->context;
> > + seqno = tmp->seqno;
> > + break;
> > + }
> >
> > if (!f)
> > + return;
> > +
> > + if (!timeout_ms)
> > + timeout_ms = 1;
> > +
> > + /*
> > + * We can't use dma_fence_wait() here. Waiting there and then
> > + * reacquiring userq_mutex creates a lockdep cycle through
> > + * dma_fence_map:
> > + * userq_mutex -> reservation_ww_class_mutex -> dma_fence_map
> > + * and
> > + * dma_fence_map -> userq_mutex
> > + * Instead, drop the mutex, sleep in bounded intervals, then
> > + * reacquire and poll the fence signaled bit.
> > + */
> > + while (timeout_ms) {
> > + unsigned long interval_ms;
> > +
> > + if (dma_fence_is_signaled(f)) {
> > + signaled = true;
> > + break;
> > + }
> > +
> > + interval_ms = min(timeout_ms,
> > + (unsigned
> long)AMDGPU_USERQ_FENCE_WAIT_POLL_MS);
> > + mutex_unlock(&uq_mgr->userq_mutex);
> > + msleep(interval_ms);
> > + mutex_lock(&uq_mgr->userq_mutex);
> > + timeout_ms -= interval_ms;
> > + }
> > +
> > + if (!signaled && dma_fence_is_signaled(f))
> > + signaled = true;
> > +
> > + dma_fence_put(f);
> > +
> > + if (signaled)
> > continue;
> >
> > - dma_fence_wait(f, false);
> > + drm_dbg(adev_to_drm(uq_mgr->adev),
> > + "Timed out waiting for fence=%llu:%llu during eviction\n",
> > + context, seqno);
> > + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> > }
> > }
> >
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] drm/amdgpu/userq: rework eviction fence suspension lock for fixing lockdep
2026-04-09 3:35 ` Liang, Prike
@ 2026-04-09 11:47 ` Christian König
2026-04-10 6:56 ` Liang, Prike
0 siblings, 1 reply; 5+ messages in thread
From: Christian König @ 2026-04-09 11:47 UTC (permalink / raw)
To: Liang, Prike, amd-gfx@lists.freedesktop.org; +Cc: Deucher, Alexander
On 4/9/26 05:35, Liang, Prike wrote:
> [Public]
>
> Regards,
> Prike
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Wednesday, April 8, 2026 4:22 PM
>> To: Liang, Prike <Prike.Liang@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
>> Subject: Re: [PATCH] drm/amdgpu/userq: rework eviction fence suspension lock for
>> fixing lockdep
>>
>> On 4/8/26 04:52, Prike Liang wrote:
>>> amdgpu_eviction_fence_suspend_worker() ran
>>> amdgpu_userq_wait_for_signal() with userq_mutex held. The helper used
>>> to walk the xarray and block on
>> queue->last_fence while keeping that lock, so the userspace signal
>> path
>> could never get the lock while the fence wait was sleeping, thus
>> triggering 120s hung task warnings.
>>
>> And that is perfectly intentional.
>>
>>>
>>> Meanwhile, this also reworks the userq lock access in the eviction
>>> suspension path to resolve the lockdep/lock order issues.
>>>
>>> Signed-off-by: Prike Liang <Prike.Liang@amd.com>
>>> ---
>>> .../drm/amd/amdgpu/amdgpu_eviction_fence.c | 2 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 107 ++++++++++++++----
>>> 2 files changed, 85 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
>>> index 5ae477c49a53..00c450e31139 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
>>> @@ -73,7 +73,6 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct
>> *work)
>>> * allocate memory while holding this lock, but only after ensuring that
>>> * the eviction fence is signaled.
>>> */
>>> - cookie = dma_fence_begin_signalling();
>>>
>>> ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
>>> amdgpu_userq_evict(uq_mgr);
>>> @@ -83,6 +82,7 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct
>> *work)
>>> * userq_mutex. Otherwise we won't resume the queues before issuing the
>>> * next fence.
>>> */
>>> + cookie = dma_fence_begin_signalling();
>>
>> Absolutely clear NAK to that. This only disables the warning but doesn't fix the
>> locking problem.
>>
>> As far as I can see the patch here is just once more utterly nonsense. What problem
>> are you exactly trying to solve?
>
> There's a lock issue, shown below, which is caused by the userq lock being held across a sleeping wait (dma_fence_wait()) in
> amdgpu_userq_wait_for_signal(), and this solution is to avoid waiting for the userq fence while holding the userq mutex
That solution is completely invalid. One of the main purposes of the userq_mutex is to prevent installing a new userq fence while we wait for the previous ones to signal.
So waiting for the dma_fence while holding the userq_mutex lock is a must have!
Who else is blocking on the userq_mutex?
Regards,
Christian.
> and to rework the
> userq mutex lock and dma fence lockdep ordering issue.
>
>
> 9] INFO: task Xorg:cs0:2019 blocked for more than 120 seconds.
> [ 7130.223182] Tainted: G U OE 6.19.0-custom #16
> [ 7130.223468] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 7130.223822] task:Xorg:cs0 state:D stack:0 pid:2019 tgid:2016 ppid:2014 task_flags:0x400040 flags:0x00080000
> [ 7130.223849] Call Trace:
> [ 7130.223861] <TASK>
> [ 7130.223880] __schedule+0x570/0x1200
> [ 7130.223905] schedule+0x47/0x160
> [ 7130.223912] schedule_preempt_disabled+0x19/0x30
> [ 7130.223918] __mutex_lock+0x6b1/0x10d0
> [ 7130.223934] ? amdgpu_userq_ensure_ev_fence+0x3c/0x110 [amdgpu]
> [ 7130.224240] mutex_lock_nested+0x1f/0x30
> [ 7130.224245] ? mutex_lock_nested+0x1f/0x30
> [ 7130.224251] amdgpu_userq_ensure_ev_fence+0x3c/0x110 [amdgpu]
> [ 7130.224512] amdgpu_userq_signal_ioctl+0x571/0x1060 [amdgpu]
> [ 7130.224748] ? srso_return_thunk+0x5/0x5f
> [ 7130.224755] ? __lock_acquire+0x43e/0x2210
> [ 7130.224785] ? srso_return_thunk+0x5/0x5f
> [ 7130.224790] ? lock_acquire+0xc6/0x2c0
> [ 7130.224797] ? drm_dev_enter+0x58/0xe0 [drm]
> [ 7130.224833] ? srso_return_thunk+0x5/0x5f
> [ 7130.224838] ? sched_clock_noinstr+0xd/0x20
> [ 7130.224844] ? srso_return_thunk+0x5/0x5f
> [ 7130.224849] ? local_clock_noinstr+0x12/0xc0
> [ 7130.224857] ? srso_return_thunk+0x5/0x5f
> [ 7130.224861] ? local_clock+0x19/0x40
> [ 7130.224867] ? srso_return_thunk+0x5/0x5f
> [ 7130.224872] ? lock_release+0x27d/0x3c0
> [ 7130.224887] ? __pfx_amdgpu_userq_signal_ioctl+0x10/0x10 [amdgpu]
> [ 7130.225110] drm_ioctl_kernel+0xaf/0x110 [drm]
> [ 7130.225149] drm_ioctl+0x290/0x510 [drm]
> [ 7130.225176] ? __pfx_amdgpu_userq_signal_ioctl+0x10/0x10 [amdgpu]
> [ 7130.225417] ? srso_return_thunk+0x5/0x5f
> [ 7130.225423] ? trace_hardirqs_on+0x5f/0xc0
> [ 7130.225429] ? srso_return_thunk+0x5/0x5f
> [ 7130.225434] ? _raw_spin_unlock_irqrestore+0x35/0x60
> [ 7130.225449] amdgpu_drm_ioctl+0x52/0x90 [amdgpu]
> [ 7130.225657] __x64_sys_ioctl+0xa0/0xf0
> [ 7130.225672] x64_sys_call+0x1280/0x21b0
> [ 7130.225679] do_syscall_64+0x6f/0x760
> [ 7130.225690] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [ 7130.225695] RIP: 0033:0x7f5339d1a9cf
> [ 7130.225701] RSP: 002b:00007f532d3fe540 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [ 7130.225708] RAX: ffffffffffffffda RBX: 0000000000000010 RCX: 00007f5339d1a9cf
> [ 7130.225712] RDX: 00007f532d3fe730 RSI: 00000000c0306457 RDI: 000000000000000f
> [ 7130.225716] RBP: 00007f532d3fe5c0 R08: 00007f532d3fe630 R09: 0000000000000001
> [ 7130.225720] R10: 00007f532d3fe818 R11: 0000000000000246 R12: 00007f532d3ff640
> [ 7130.225724] R13: 0000000000000016 R14: 00007f5339c947d0 R15: 00007ffe53cb5130
> [ 7130.225755] </TASK>
> [ 7130.225818] INFO: task Xorg:cs0:2019 is blocked on a mutex likely owned by task kworker/5:0:4873.
> [ 7130.226193] INFO: task kworker/2:1:4775 blocked for more than 120 seconds.
> [ 7130.226514] Tainted: G U OE 6.19.0-custom #16
> [ 7130.226764] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 7130.227082] task:kworker/2:1 state:D stack:0 pid:4775 tgid:4775 ppid:2 task_flags:0x4208060 flags:0x00080000
> [ 7130.227091] Workqueue: events amdgpu_userq_hang_detect_work [amdgpu]
> [ 7130.227341] Call Trace:
> [ 7130.227345] <TASK>
> [ 7130.227356] __schedule+0x570/0x1200
> [ 7130.227378] schedule+0x47/0x160
> [ 7130.227385] schedule_preempt_disabled+0x19/0x30
> [ 7130.227390] __mutex_lock+0x6b1/0x10d0
> [ 7130.227403] ? amdgpu_userq_hang_detect_work+0x5a/0x80 [amdgpu]
> [ 7130.227660] mutex_lock_nested+0x1f/0x30
> [ 7130.227667] ? mutex_lock_nested+0x1f/0x30
> [ 7130.227673] amdgpu_userq_hang_detect_work+0x5a/0x80 [amdgpu]
> [ 7130.227918] process_one_work+0x233/0x650
> [ 7130.227944] worker_thread+0x1b2/0x360
> [ 7130.227957] kthread+0x11c/0x260
> [ 7130.227964] ? srso_return_thunk+0x5/0x5f
> [ 7130.227969] ? __pfx_worker_thread+0x10/0x10
> [ 7130.227977] ? __pfx_kthread+0x10/0x10
> [ 7130.227987] ret_from_fork+0x29f/0x2f0
> [ 7130.227995] ? __pfx_kthread+0x10/0x10
> [ 7130.228003] ret_from_fork_asm+0x1a/0x30
> [ 7130.228034] </TASK>
> [ 7130.228089] INFO: task kworker/2:1:4775 is blocked on a mutex likely owned by task kworker/5:0:4873.
> [ 7130.228462] INFO: task kworker/5:0:4873 blocked for more than 120 seconds.
> [ 7130.228743] Tainted: G U OE 6.19.0-custom #16
> [ 7130.228993] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 7130.229309] task:kworker/5:0 state:D stack:0 pid:4873 tgid:4873 ppid:2 task_flags:0x4208060 flags:0x00080000
> [ 7130.229318] Workqueue: events amdgpu_eviction_fence_suspend_worker [amdgpu]
> [ 7130.229543] Call Trace:
> [ 7130.229547] <TASK>
> [ 7130.229558] __schedule+0x570/0x1200
> [ 7130.229579] schedule+0x47/0x160
> [ 7130.229586] schedule_timeout+0x10a/0x120
> [ 7130.229591] ? srso_return_thunk+0x5/0x5f
> [ 7130.229597] ? mark_held_locks+0x54/0x90
> [ 7130.229610] ? srso_return_thunk+0x5/0x5f
> [ 7130.229615] ? trace_hardirqs_on+0x5f/0xc0
> [ 7130.229621] ? srso_return_thunk+0x5/0x5f
> [ 7130.229631] dma_fence_default_wait+0x1f5/0x290
> [ 7130.229640] ? dma_fence_default_wait+0xfc/0x290
> [ 7130.229649] ? __pfx_dma_fence_default_wait_cb+0x10/0x10
> [ 7130.229663] dma_fence_wait_timeout+0x300/0x3c0
> [ 7130.229675] amdgpu_userq_evict+0x67/0x120 [amdgpu]
> [ 7130.229930] amdgpu_eviction_fence_suspend_worker+0x4d/0xd0 [amdgpu]
> [ 7130.230162] process_one_work+0x233/0x650
> [ 7130.230187] worker_thread+0x1b2/0x360
> [ 7130.230200] kthread+0x11c/0x260
> [ 7130.230205] ? srso_return_thunk+0x5/0x5f
> [ 7130.230211] ? __pfx_worker_thread+0x10/0x10
> [ 7130.230218] ? __pfx_kthread+0x10/0x10
> [ 7130.230229] ret_from_fork+0x29f/0x2f0
> [ 7130.230234] ? __pfx_kthread+0x10/0x10
> [ 7130.230243] ret_from_fork_asm+0x1a/0x30
> [ 7130.230274] </TASK>
>> Regards,
>> Christian.
>>
>>> dma_fence_signal(ev_fence);
>>> dma_fence_end_signalling(cookie);
>>> dma_fence_put(ev_fence);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>> index 9d3c39e96ac1..7691f169415b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>> @@ -26,6 +26,7 @@
>>> #include <drm/drm_exec.h>
>>> #include <linux/pm_runtime.h>
>>> #include <drm/drm_drv.h>
>>> +#include <linux/lockdep.h>
>>>
>>> #include "amdgpu.h"
>>> #include "amdgpu_reset.h"
>>> @@ -34,6 +35,23 @@
>>> #include "amdgpu_hmm.h"
>>> #include "amdgpu_userq_fence.h"
>>>
>>> +#define AMDGPU_USERQ_FENCE_WAIT_POLL_MS 1000 static unsigned long
>>> +amdgpu_userq_fence_timeout_ms(struct amdgpu_usermode_queue *queue) {
>>> + struct amdgpu_device *adev = queue->userq_mgr->adev;
>>> + switch (queue->queue_type) {
>>> + case AMDGPU_RING_TYPE_GFX:
>>> + return adev->gfx_timeout;
>>> + case AMDGPU_RING_TYPE_COMPUTE:
>>> + return adev->compute_timeout;
>>> + case AMDGPU_RING_TYPE_SDMA:
>>> + return adev->sdma_timeout;
>>> + default:
>>> + return adev->gfx_timeout;
>>> + }
>>> +}
>>> +
>>> u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev) {
>>> int i;
>>> @@ -176,29 +194,12 @@ static void amdgpu_userq_hang_detect_work(struct
>>> work_struct *work) */ void
>>> amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue
>>> *queue) {
>>> - struct amdgpu_device *adev;
>>> unsigned long timeout_ms;
>>>
>>> if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev)
>>> return;
>>>
>>> - adev = queue->userq_mgr->adev;
>>> - /* Determine timeout based on queue type */
>>> - switch (queue->queue_type) {
>>> - case AMDGPU_RING_TYPE_GFX:
>>> - timeout_ms = adev->gfx_timeout;
>>> - break;
>>> - case AMDGPU_RING_TYPE_COMPUTE:
>>> - timeout_ms = adev->compute_timeout;
>>> - break;
>>> - case AMDGPU_RING_TYPE_SDMA:
>>> - timeout_ms = adev->sdma_timeout;
>>> - break;
>>> - default:
>>> - timeout_ms = adev->gfx_timeout;
>>> - break;
>>> - }
>>> -
>>> + timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
>>> /* Store the fence to monitor and schedule hang detection */
>>> WRITE_ONCE(queue->hang_detect_fence, queue->last_fence);
>>> schedule_delayed_work(&queue->hang_detect_work,
>>> @@ -1274,16 +1275,76 @@ void amdgpu_userq_reset_work(struct
>>> work_struct *work) static void amdgpu_userq_wait_for_signal(struct
>>> amdgpu_userq_mgr *uq_mgr) {
>>> - struct amdgpu_usermode_queue *queue;
>>> - unsigned long queue_id;
>>> + lockdep_assert_held(&uq_mgr->userq_mutex);
>>>
>>> - xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
>>> - struct dma_fence *f = queue->last_fence;
>>> + /* Rescan the userq xarray after each fence poll interval to get
>>> + * newly added queues or fences.
>>> + */
>>> + for (;;) {
>>> + struct amdgpu_usermode_queue *queue;
>>> + unsigned long queue_id = 0;
>>> + struct dma_fence *f = NULL;
>>> + unsigned long timeout_ms = 0;
>>> + u64 context = 0, seqno = 0;
>>> + bool signaled = false;
>>> +
>>> + xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
>>> + struct dma_fence *tmp = queue->last_fence;
>>> +
>>> + if (!tmp || dma_fence_is_signaled(tmp))
>>> + continue;
>>> +
>>> + f = dma_fence_get(tmp);
>>> + timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
>>> + context = tmp->context;
>>> + seqno = tmp->seqno;
>>> + break;
>>> + }
>>>
>>> if (!f)
>>> + return;
>>> +
>>> + if (!timeout_ms)
>>> + timeout_ms = 1;
>>> +
>>> + /*
>>> + * We can't use dma_fence_wait() here. Waiting there and then
>>> + * reacquiring userq_mutex creates a lockdep cycle through
>>> + * dma_fence_map:
>>> + * userq_mutex -> reservation_ww_class_mutex -> dma_fence_map
>>> + * and
>>> + * dma_fence_map -> userq_mutex
>>> + * Instead, drop the mutex, sleep in bounded intervals, then
>>> + * reacquire and poll the fence signaled bit.
>>> + */
>>> + while (timeout_ms) {
>>> + unsigned long interval_ms;
>>> +
>>> + if (dma_fence_is_signaled(f)) {
>>> + signaled = true;
>>> + break;
>>> + }
>>> +
>>> + interval_ms = min(timeout_ms,
>>> + (unsigned
>> long)AMDGPU_USERQ_FENCE_WAIT_POLL_MS);
>>> + mutex_unlock(&uq_mgr->userq_mutex);
>>> + msleep(interval_ms);
>>> + mutex_lock(&uq_mgr->userq_mutex);
>>> + timeout_ms -= interval_ms;
>>> + }
>>> +
>>> + if (!signaled && dma_fence_is_signaled(f))
>>> + signaled = true;
>>> +
>>> + dma_fence_put(f);
>>> +
>>> + if (signaled)
>>> continue;
>>>
>>> - dma_fence_wait(f, false);
>>> + drm_dbg(adev_to_drm(uq_mgr->adev),
>>> + "Timed out waiting for fence=%llu:%llu during eviction\n",
>>> + context, seqno);
>>> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
>>> }
>>> }
>>>
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* RE: [PATCH] drm/amdgpu/userq: rework eviction fence suspension lock for fixing lockdep
2026-04-09 11:47 ` Christian König
@ 2026-04-10 6:56 ` Liang, Prike
0 siblings, 0 replies; 5+ messages in thread
From: Liang, Prike @ 2026-04-10 6:56 UTC (permalink / raw)
To: Koenig, Christian, amd-gfx@lists.freedesktop.org; +Cc: Deucher, Alexander
[Public]
Regards,
Prike
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Thursday, April 9, 2026 7:47 PM
> To: Liang, Prike <Prike.Liang@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/userq: rework eviction fence suspension lock for
> fixing lockdep
>
> On 4/9/26 05:35, Liang, Prike wrote:
> > [Public]
> >
> > Regards,
> > Prike
> >
> >> -----Original Message-----
> >> From: Koenig, Christian <Christian.Koenig@amd.com>
> >> Sent: Wednesday, April 8, 2026 4:22 PM
> >> To: Liang, Prike <Prike.Liang@amd.com>; amd-gfx@lists.freedesktop.org
> >> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
> >> Subject: Re: [PATCH] drm/amdgpu/userq: rework eviction fence
> >> suspension lock for fixing lockdep
> >>
> >> On 4/8/26 04:52, Prike Liang wrote:
> >>> amdgpu_eviction_fence_suspend_worker() ran
> >>> amdgpu_userq_wait_for_signal() with userq_mutex held. The helper
> >>> used to walk the xarray and block on
> >> queue->last_fence while keeping that lock, so the userspace signal
> >> path
> >> could never get the lock while the fence wait was sleeping, thus
> >> triggering 120s hung task warnings.
> >>
> >> And that is perfectly intentional.
> >>
> >>>
> >>> Meanwhile, this also reworks the userq lock access in the eviction
> >>> suspension path to resolve the lockdep/lock order issues.
> >>>
> >>> Signed-off-by: Prike Liang <Prike.Liang@amd.com>
> >>> ---
> >>> .../drm/amd/amdgpu/amdgpu_eviction_fence.c | 2 +-
> >>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 107 ++++++++++++++----
> >>> 2 files changed, 85 insertions(+), 24 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> >>> index 5ae477c49a53..00c450e31139 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> >>> @@ -73,7 +73,6 @@ amdgpu_eviction_fence_suspend_worker(struct
> >>> work_struct
> >> *work)
> >>> * allocate memory while holding this lock, but only after ensuring that
> >>> * the eviction fence is signaled.
> >>> */
> >>> - cookie = dma_fence_begin_signalling();
> >>>
> >>> ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
> >>> amdgpu_userq_evict(uq_mgr);
> >>> @@ -83,6 +82,7 @@ amdgpu_eviction_fence_suspend_worker(struct
> >>> work_struct
> >> *work)
> >>> * userq_mutex. Otherwise we won't resume the queues before issuing the
> >>> * next fence.
> >>> */
> >>> + cookie = dma_fence_begin_signalling();
> >>
> >> Absolutely clear NAK to that. This only disables the warning but
> >> doesn't fix the locking problem.
> >>
> >> As far as I can see the patch here is just once more utterly
> >> nonsense. What problem are you exactly trying to solve?
> >
> > There's a lock issue, shown below, which is caused by the userq lock
> > being held across a sleeping wait (dma_fence_wait()) in
> > amdgpu_userq_wait_for_signal(), and this solution is to avoid waiting
> > for the userq fence while holding the userq mutex
>
> That solution is completely invalid. One of the main purposes of the userq_mutex is
> to prevent installing a new userq fence while we wait for the previous ones to signal.
>
> So waiting for the dma_fence while holding the userq_mutex lock is a must have!
>
> Who else is blocking on the userq_mutex?
From the lock hang dump log, the mutex is acquired and then blocked at dma_fence_wait() during eviction fence suspension, and that wait seems ultimately blocked on the HW processing the queue. If this is a HW processing timeout or hang issue, then the SW can't do much in this case? This issue can't be reproduced consistently right now; I will investigate further once the issue is reproduced.
> Regards,
> Christian.
>
> > and to rework the
> > userq mutex lock and dma fence lockdep ordering issue.
> >
> >
> > 9] INFO: task Xorg:cs0:2019 blocked for more than 120 seconds.
> > [ 7130.223182] Tainted: G U OE 6.19.0-custom #16
> > [ 7130.223468] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
> message.
> > [ 7130.223822] task:Xorg:cs0 state:D stack:0 pid:2019 tgid:2016 ppid:2014
> task_flags:0x400040 flags:0x00080000
> > [ 7130.223849] Call Trace:
> > [ 7130.223861] <TASK>
> > [ 7130.223880] __schedule+0x570/0x1200 [ 7130.223905]
> > schedule+0x47/0x160 [ 7130.223912]
> > schedule_preempt_disabled+0x19/0x30
> > [ 7130.223918] __mutex_lock+0x6b1/0x10d0 [ 7130.223934] ?
> > amdgpu_userq_ensure_ev_fence+0x3c/0x110 [amdgpu] [ 7130.224240]
> > mutex_lock_nested+0x1f/0x30 [ 7130.224245] ?
> > mutex_lock_nested+0x1f/0x30 [ 7130.224251]
> > amdgpu_userq_ensure_ev_fence+0x3c/0x110 [amdgpu] [ 7130.224512]
> > amdgpu_userq_signal_ioctl+0x571/0x1060 [amdgpu] [ 7130.224748] ?
> > srso_return_thunk+0x5/0x5f [ 7130.224755] ?
> > __lock_acquire+0x43e/0x2210 [ 7130.224785] ?
> > srso_return_thunk+0x5/0x5f [ 7130.224790] ? lock_acquire+0xc6/0x2c0 [
> > 7130.224797] ? drm_dev_enter+0x58/0xe0 [drm] [ 7130.224833] ?
> > srso_return_thunk+0x5/0x5f [ 7130.224838] ?
> > sched_clock_noinstr+0xd/0x20 [ 7130.224844] ?
> > srso_return_thunk+0x5/0x5f [ 7130.224849] ?
> > local_clock_noinstr+0x12/0xc0 [ 7130.224857] ?
> > srso_return_thunk+0x5/0x5f [ 7130.224861] ? local_clock+0x19/0x40 [
> > 7130.224867] ? srso_return_thunk+0x5/0x5f [ 7130.224872] ?
> > lock_release+0x27d/0x3c0 [ 7130.224887] ?
> > __pfx_amdgpu_userq_signal_ioctl+0x10/0x10 [amdgpu] [ 7130.225110]
> > drm_ioctl_kernel+0xaf/0x110 [drm] [ 7130.225149]
> > drm_ioctl+0x290/0x510 [drm] [ 7130.225176] ?
> > __pfx_amdgpu_userq_signal_ioctl+0x10/0x10 [amdgpu] [ 7130.225417] ?
> > srso_return_thunk+0x5/0x5f [ 7130.225423] ?
> > trace_hardirqs_on+0x5f/0xc0 [ 7130.225429] ?
> > srso_return_thunk+0x5/0x5f [ 7130.225434] ?
> > _raw_spin_unlock_irqrestore+0x35/0x60
> > [ 7130.225449] amdgpu_drm_ioctl+0x52/0x90 [amdgpu] [ 7130.225657]
> > __x64_sys_ioctl+0xa0/0xf0 [ 7130.225672] x64_sys_call+0x1280/0x21b0 [
> > 7130.225679] do_syscall_64+0x6f/0x760 [ 7130.225690]
> > entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > [ 7130.225695] RIP: 0033:0x7f5339d1a9cf [ 7130.225701] RSP:
> > 002b:00007f532d3fe540 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [
> > 7130.225708] RAX: ffffffffffffffda RBX: 0000000000000010 RCX:
> > 00007f5339d1a9cf [ 7130.225712] RDX: 00007f532d3fe730 RSI:
> > 00000000c0306457 RDI: 000000000000000f [ 7130.225716] RBP:
> > 00007f532d3fe5c0 R08: 00007f532d3fe630 R09: 0000000000000001 [
> > 7130.225720] R10: 00007f532d3fe818 R11: 0000000000000246 R12:
> > 00007f532d3ff640 [ 7130.225724] R13: 0000000000000016 R14:
> > 00007f5339c947d0 R15: 00007ffe53cb5130 [ 7130.225755] </TASK>
> [ 7130.225818] INFO: task Xorg:cs0:2019 is blocked on a mutex likely owned by
> task kworker/5:0:4873.
> > [ 7130.226193] INFO: task kworker/2:1:4775 blocked for more than 120 seconds.
> > [ 7130.226514] Tainted: G U OE 6.19.0-custom #16
> > [ 7130.226764] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
> message.
> > [ 7130.227082] task:kworker/2:1 state:D stack:0 pid:4775 tgid:4775 ppid:2
> task_flags:0x4208060 flags:0x00080000
> > [ 7130.227091] Workqueue: events amdgpu_userq_hang_detect_work
> > [amdgpu] [ 7130.227341] Call Trace:
> > [ 7130.227345] <TASK>
> > [ 7130.227356] __schedule+0x570/0x1200 [ 7130.227378]
> > schedule+0x47/0x160 [ 7130.227385]
> > schedule_preempt_disabled+0x19/0x30
> > [ 7130.227390] __mutex_lock+0x6b1/0x10d0 [ 7130.227403] ?
> > amdgpu_userq_hang_detect_work+0x5a/0x80 [amdgpu] [ 7130.227660]
> > mutex_lock_nested+0x1f/0x30 [ 7130.227667] ?
> > mutex_lock_nested+0x1f/0x30 [ 7130.227673]
> > amdgpu_userq_hang_detect_work+0x5a/0x80 [amdgpu] [ 7130.227918]
> > process_one_work+0x233/0x650 [ 7130.227944] worker_thread+0x1b2/0x360
> > [ 7130.227957] kthread+0x11c/0x260 [ 7130.227964] ?
> > srso_return_thunk+0x5/0x5f [ 7130.227969] ?
> > __pfx_worker_thread+0x10/0x10 [ 7130.227977] ?
> > __pfx_kthread+0x10/0x10 [ 7130.227987] ret_from_fork+0x29f/0x2f0 [
> > 7130.227995] ? __pfx_kthread+0x10/0x10 [ 7130.228003]
> > ret_from_fork_asm+0x1a/0x30 [ 7130.228034] </TASK> [ 7130.228089]
> > INFO: task kworker/2:1:4775 is blocked on a mutex likely owned by task
> kworker/5:0:4873.
> > [ 7130.228462] INFO: task kworker/5:0:4873 blocked for more than 120 seconds.
> > [ 7130.228743] Tainted: G U OE 6.19.0-custom #16
> > [ 7130.228993] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
> message.
> > [ 7130.229309] task:kworker/5:0 state:D stack:0 pid:4873 tgid:4873 ppid:2
> task_flags:0x4208060 flags:0x00080000
> > [ 7130.229318] Workqueue: events amdgpu_eviction_fence_suspend_worker
> > [amdgpu] [ 7130.229543] Call Trace:
> > [ 7130.229547] <TASK>
> > [ 7130.229558] __schedule+0x570/0x1200 [ 7130.229579]
> > schedule+0x47/0x160 [ 7130.229586] schedule_timeout+0x10a/0x120 [
> > 7130.229591] ? srso_return_thunk+0x5/0x5f [ 7130.229597] ?
> > mark_held_locks+0x54/0x90 [ 7130.229610] ? srso_return_thunk+0x5/0x5f
> > [ 7130.229615] ? trace_hardirqs_on+0x5f/0xc0 [ 7130.229621] ?
> > srso_return_thunk+0x5/0x5f [ 7130.229631]
> > dma_fence_default_wait+0x1f5/0x290
> > [ 7130.229640] ? dma_fence_default_wait+0xfc/0x290 [ 7130.229649] ?
> > __pfx_dma_fence_default_wait_cb+0x10/0x10
> > [ 7130.229663] dma_fence_wait_timeout+0x300/0x3c0
> > [ 7130.229675] amdgpu_userq_evict+0x67/0x120 [amdgpu] [ 7130.229930]
> > amdgpu_eviction_fence_suspend_worker+0x4d/0xd0 [amdgpu] [ 7130.230162]
> > process_one_work+0x233/0x650 [ 7130.230187] worker_thread+0x1b2/0x360
> > [ 7130.230200] kthread+0x11c/0x260 [ 7130.230205] ?
> > srso_return_thunk+0x5/0x5f [ 7130.230211] ?
> > __pfx_worker_thread+0x10/0x10 [ 7130.230218] ?
> > __pfx_kthread+0x10/0x10 [ 7130.230229] ret_from_fork+0x29f/0x2f0 [
> > 7130.230234] ? __pfx_kthread+0x10/0x10 [ 7130.230243]
> > ret_from_fork_asm+0x1a/0x30 [ 7130.230274] </TASK>
> >> Regards,
> >> Christian.
> >>
> >>> dma_fence_signal(ev_fence);
> >>> dma_fence_end_signalling(cookie);
> >>> dma_fence_put(ev_fence);
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> >>> index 9d3c39e96ac1..7691f169415b 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> >>> @@ -26,6 +26,7 @@
> >>> #include <drm/drm_exec.h>
> >>> #include <linux/pm_runtime.h>
> >>> #include <drm/drm_drv.h>
> >>> +#include <linux/lockdep.h>
> >>>
> >>> #include "amdgpu.h"
> >>> #include "amdgpu_reset.h"
> >>> @@ -34,6 +35,23 @@
> >>> #include "amdgpu_hmm.h"
> >>> #include "amdgpu_userq_fence.h"
> >>>
> >>> +#define AMDGPU_USERQ_FENCE_WAIT_POLL_MS 1000 static unsigned
> long
> >>> +amdgpu_userq_fence_timeout_ms(struct amdgpu_usermode_queue *queue) {
> >>> + struct amdgpu_device *adev = queue->userq_mgr->adev;
> >>> + switch (queue->queue_type) {
> >>> + case AMDGPU_RING_TYPE_GFX:
> >>> + return adev->gfx_timeout;
> >>> + case AMDGPU_RING_TYPE_COMPUTE:
> >>> + return adev->compute_timeout;
> >>> + case AMDGPU_RING_TYPE_SDMA:
> >>> + return adev->sdma_timeout;
> >>> + default:
> >>> + return adev->gfx_timeout;
> >>> + }
> >>> +}
> >>> +
> >>> u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev) {
> >>> int i;
> >>> @@ -176,29 +194,12 @@ static void
> >>> amdgpu_userq_hang_detect_work(struct
> >>> work_struct *work) */ void
> >>> amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue
> >>> *queue) {
> >>> - struct amdgpu_device *adev;
> >>> unsigned long timeout_ms;
> >>>
> >>> if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev)
> >>> return;
> >>>
> >>> - adev = queue->userq_mgr->adev;
> >>> - /* Determine timeout based on queue type */
> >>> - switch (queue->queue_type) {
> >>> - case AMDGPU_RING_TYPE_GFX:
> >>> - timeout_ms = adev->gfx_timeout;
> >>> - break;
> >>> - case AMDGPU_RING_TYPE_COMPUTE:
> >>> - timeout_ms = adev->compute_timeout;
> >>> - break;
> >>> - case AMDGPU_RING_TYPE_SDMA:
> >>> - timeout_ms = adev->sdma_timeout;
> >>> - break;
> >>> - default:
> >>> - timeout_ms = adev->gfx_timeout;
> >>> - break;
> >>> - }
> >>> -
> >>> + timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
> >>> /* Store the fence to monitor and schedule hang detection */
> >>> WRITE_ONCE(queue->hang_detect_fence, queue->last_fence);
> >>> schedule_delayed_work(&queue->hang_detect_work,
> >>> @@ -1274,16 +1275,76 @@ void amdgpu_userq_reset_work(struct
> >>> work_struct *work) static void amdgpu_userq_wait_for_signal(struct
> >>> amdgpu_userq_mgr *uq_mgr) {
> >>> - struct amdgpu_usermode_queue *queue;
> >>> - unsigned long queue_id;
> >>> + lockdep_assert_held(&uq_mgr->userq_mutex);
> >>>
> >>> - xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
> >>> - struct dma_fence *f = queue->last_fence;
> >>> + /* Rescan the userq xarray after each fence poll interval to get
> >>> + * newly added queues or fences.
> >>> + */
> >>> + for (;;) {
> >>> + struct amdgpu_usermode_queue *queue;
> >>> + unsigned long queue_id = 0;
> >>> + struct dma_fence *f = NULL;
> >>> + unsigned long timeout_ms = 0;
> >>> + u64 context = 0, seqno = 0;
> >>> + bool signaled = false;
> >>> +
> >>> + xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
> >>> + struct dma_fence *tmp = queue->last_fence;
> >>> +
> >>> + if (!tmp || dma_fence_is_signaled(tmp))
> >>> + continue;
> >>> +
> >>> + f = dma_fence_get(tmp);
> >>> + timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
> >>> + context = tmp->context;
> >>> + seqno = tmp->seqno;
> >>> + break;
> >>> + }
> >>>
> >>> if (!f)
> >>> + return;
> >>> +
> >>> + if (!timeout_ms)
> >>> + timeout_ms = 1;
> >>> +
> >>> + /*
> >>> + * We can't use dma_fence_wait() here. Waiting there and then
> >>> + * reacquiring userq_mutex creates a lockdep cycle through
> >>> + * dma_fence_map:
> >>> + * userq_mutex -> reservation_ww_class_mutex -> dma_fence_map
> >>> + * and
> >>> + * dma_fence_map -> userq_mutex
> >>> + * Instead, drop the mutex, sleep in bounded intervals, then
> >>> + * reacquire and poll the fence signaled bit.
> >>> + */
> >>> + while (timeout_ms) {
> >>> + unsigned long interval_ms;
> >>> +
> >>> + if (dma_fence_is_signaled(f)) {
> >>> + signaled = true;
> >>> + break;
> >>> + }
> >>> +
> >>> + interval_ms = min(timeout_ms,
> >>> + (unsigned
> >> long)AMDGPU_USERQ_FENCE_WAIT_POLL_MS);
> >>> + mutex_unlock(&uq_mgr->userq_mutex);
> >>> + msleep(interval_ms);
> >>> + mutex_lock(&uq_mgr->userq_mutex);
> >>> + timeout_ms -= interval_ms;
> >>> + }
> >>> +
> >>> + if (!signaled && dma_fence_is_signaled(f))
> >>> + signaled = true;
> >>> +
> >>> + dma_fence_put(f);
> >>> +
> >>> + if (signaled)
> >>> continue;
> >>>
> >>> - dma_fence_wait(f, false);
> >>> + drm_dbg(adev_to_drm(uq_mgr->adev),
> >>> + "Timed out waiting for fence=%llu:%llu during eviction\n",
> >>> + context, seqno);
> >>> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> >>> }
> >>> }
> >>>
> >
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2026-04-10 6:56 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-04-08 2:52 [PATCH] drm/amdgpu/userq: rework eviction fence suspension lock for fixing lockdep Prike Liang
2026-04-08 8:21 ` Christian König
2026-04-09 3:35 ` Liang, Prike
2026-04-09 11:47 ` Christian König
2026-04-10 6:56 ` Liang, Prike
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.