AMD-GFX Archive on lore.kernel.org
* [PATCH] drm/amdgpu/userq: make sure only one reset work thread runs at a time
@ 2026-05-12  7:04 Sunil Khatri
  2026-05-12  8:22 ` Christian König
  0 siblings, 1 reply; 3+ messages in thread
From: Sunil Khatri @ 2026-05-12  7:04 UTC (permalink / raw)
  To: Alex Deucher, Christian König; +Cc: amd-gfx, Sunil Khatri

CPU0:   hang_detect_work → directly calls reset_work()
CPU1:   evict_all → queues reset_work (via workqueue)

There is a possibility of two reset threads running at the same time.
To avoid that, add a per-queue-manager flag so that only the first
caller performs the reset and any duplicate exits early.

Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 16 ++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h |  1 +
 2 files changed, 17 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 0a1fc45f5b4e..1440f51b667f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -109,6 +109,19 @@ static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
 	if (!amdgpu_gpu_recovery)
 		return;
 
+	/*
+	 * Prevent concurrent/duplicate reset executions. Both hang_detect_work
+	 * (direct call) and evict_all (via schedule+flush_work) can invoke this
+	 * function simultaneously. Use an atomic test-and-set so only the first
+	 * caller proceeds; the second exits early.
+	 *
+	 * Note: amdgpu_in_reset() cannot be used here because in_gpu_reset is
+	 * only set deep inside amdgpu_device_gpu_recover(), well after we've
+	 * already entered this function.
+	 */
+	if (atomic_cmpxchg(&uq_mgr->reset_in_progress, 0, 1) != 0)
+		return;
+
 	/*
 	 * Iterate through all queue types to detect and reset problematic queues
 	 * Process each queue type in the defined order
@@ -145,6 +158,8 @@ static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
 
 		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 	}
+
+	atomic_set(&uq_mgr->reset_in_progress, 0);
 }
 
 static void amdgpu_userq_hang_detect_work(struct work_struct *work)
@@ -1304,6 +1319,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
 
 	INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
 	INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
+	atomic_set(&userq_mgr->reset_in_progress, 0);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 49b33e2d6932..2748ecc0f6c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -129,6 +129,7 @@ struct amdgpu_userq_mgr {
 	 * Reset work which is used when eviction fails.
 	 */
 	struct work_struct		reset_work;
+	atomic_t			reset_in_progress;
 	atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
 };
 
-- 
2.34.1



Thread overview: 3+ messages
2026-05-12  7:04 [PATCH] drm/amdgpu/userq: make sure only one reset work thread runs at a time Sunil Khatri
2026-05-12  8:22 ` Christian König
2026-05-12  9:31   ` Khatri, Sunil
