public inbox for amd-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed
From: "Khatri, Sunil" <sukhatri@amd.com>
To: "Christian König" <ckoenig.leichtzumerken@gmail.com>,
	alexander.deucher@amd.com, Prike.Liang@amd.com,
	amd-gfx@lists.freedesktop.org
Cc: christian.koenig@amd.com
Subject: Re: [PATCH 06/11] drm/amdgpu: remove almost all calls to amdgpu_userq_detect_and_reset_queues
Date: Wed, 22 Apr 2026 15:50:03 +0530	[thread overview]
Message-ID: <30fcb9fe-cef3-4320-b430-735071e808c8@amd.com> (raw)
In-Reply-To: <20260421125513.4545-6-christian.koenig@amd.com>

Now this is exactly how I nearly disabled the reset logic in my
validation setup.
Looks clean and as per the expectations.

Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>

On 21-04-2026 06:25 pm, Christian König wrote:
> Well the reset handling seems broken on multiple levels.
>
> As a first step towards fixing this, remove most calls to the hang detection.
> That function should only be called after we run into a timeout! And *NOT*
> as a random check spread over the code in multiple places.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 38 +++++++++--------------
>   1 file changed, 14 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 8ce001481d42..5ccd53ad8efd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -345,23 +345,18 @@ static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue)
>   	struct amdgpu_device *adev = uq_mgr->adev;
>   	const struct amdgpu_userq_funcs *userq_funcs =
>   		adev->userq_funcs[queue->queue_type];
> -	bool found_hung_queue = false;
> -	int r = 0;
> +	int r;
>   
>   	if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
>   		r = userq_funcs->preempt(queue);
>   		if (r) {
>   			queue->state = AMDGPU_USERQ_STATE_HUNG;
> -			found_hung_queue = true;
> +			return r;
>   		} else {
>   			queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
>   		}
>   	}
> -
> -	if (found_hung_queue)
> -		amdgpu_userq_detect_and_reset_queues(uq_mgr);
> -
> -	return r;
> +	return 0;
>   }
>   
>   static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue)
> @@ -390,24 +385,21 @@ static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue)
>   	struct amdgpu_device *adev = uq_mgr->adev;
>   	const struct amdgpu_userq_funcs *userq_funcs =
>   		adev->userq_funcs[queue->queue_type];
> -	bool found_hung_queue = false;
> -	int r = 0;
> +	int r;
>   
>   	if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
> -		(queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
> +	    (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
> +
>   		r = userq_funcs->unmap(queue);
>   		if (r) {
>   			queue->state = AMDGPU_USERQ_STATE_HUNG;
> -			found_hung_queue = true;
> +			return r;
>   		} else {
>   			queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
>   		}
>   	}
>   
> -	if (found_hung_queue)
> -		amdgpu_userq_detect_and_reset_queues(uq_mgr);
> -
> -	return r;
> +	return 0;
>   }
>   
>   static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue)
> @@ -416,19 +408,19 @@ static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue)
>   	struct amdgpu_device *adev = uq_mgr->adev;
>   	const struct amdgpu_userq_funcs *userq_funcs =
>   		adev->userq_funcs[queue->queue_type];
> -	int r = 0;
> +	int r;
>   
>   	if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
>   		r = userq_funcs->map(queue);
>   		if (r) {
>   			queue->state = AMDGPU_USERQ_STATE_HUNG;
> -			amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +			return r;
>   		} else {
>   			queue->state = AMDGPU_USERQ_STATE_MAPPED;
>   		}
>   	}
>   
> -	return r;
> +	return 0;
>   }
>   
>   static void amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue)
> @@ -654,7 +646,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
>   #if defined(CONFIG_DEBUG_FS)
>   	debugfs_remove_recursive(queue->debugfs_queue);
>   #endif
> -	amdgpu_userq_detect_and_reset_queues(uq_mgr);
>   	r = amdgpu_userq_unmap_helper(queue);
>   	/*TODO: It requires a reset for userq hw unmap error*/
>   	if (r) {
> @@ -1268,7 +1259,6 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>   	unsigned long queue_id;
>   	int ret = 0, r;
>   
> -	amdgpu_userq_detect_and_reset_queues(uq_mgr);
>   	/* Try to unmap all the queues in this process ctx */
>   	xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
>   		r = amdgpu_userq_preempt_helper(queue);
> @@ -1276,9 +1266,11 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>   			ret = r;
>   	}
>   
> -	if (ret)
> +	if (ret) {
>   		drm_file_err(uq_mgr->file,
>   			     "Couldn't unmap all the queues, eviction failed ret=%d\n", ret);
> +		amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +	}
>   	return ret;
>   }
>   
> @@ -1378,7 +1370,6 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
>   		uqm = queue->userq_mgr;
>   		cancel_delayed_work_sync(&uqm->resume_work);
>   		guard(mutex)(&uqm->userq_mutex);
> -		amdgpu_userq_detect_and_reset_queues(uqm);
>   		if (adev->in_s0ix)
>   			r = amdgpu_userq_preempt_helper(queue);
>   		else
> @@ -1437,7 +1428,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>   		if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
>   		     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
>   		    (queue->xcp_id == idx)) {
> -			amdgpu_userq_detect_and_reset_queues(uqm);
>   			r = amdgpu_userq_preempt_helper(queue);
>   			if (r)
>   				ret = r;

  reply	other threads:[~2026-04-22 10:20 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-21 12:55 [PATCH 01/11] drm/amdgpu: fix AMDGPU_INFO_READ_MMR_REG Christian König
2026-04-21 12:55 ` [PATCH 02/11] drm/amdgpu: remove deadlocks from amdgpu_userq_pre_reset Christian König
2026-04-22  4:53   ` Khatri, Sunil
2026-04-22  7:13     ` Christian König
2026-04-22  7:19       ` Khatri, Sunil
2026-04-22  7:24         ` Christian König
2026-04-22  7:29           ` Khatri, Sunil
2026-04-27  8:45   ` Liang, Prike
2026-04-21 12:55 ` [PATCH 03/11] drm/amdgpu: nuke amdgpu_userq_fence_free Christian König
2026-04-22  8:29   ` Khatri, Sunil
2026-04-22  9:26     ` Christian König
2026-04-22  9:40       ` Khatri, Sunil
2026-04-22 10:12         ` Christian König
2026-04-22 14:32           ` Khatri, Sunil
2026-04-27  6:21   ` Liang, Prike
2026-04-21 12:55 ` [PATCH 04/11] drm/amdgpu: rework amdgpu_userq_signal_ioctl Christian König
2026-04-22 10:08   ` Khatri, Sunil
2026-04-22 10:14     ` Christian König
2026-04-22 15:14       ` Khatri, Sunil
2026-04-23  9:58   ` Liang, Prike
2026-04-23 10:47     ` Christian König
2026-04-23 10:54       ` Khatri, Sunil
2026-04-24  8:01       ` Liang, Prike
2026-04-24 13:02         ` Christian König
2026-04-21 12:55 ` [PATCH 05/11] drm/amdgpu: rework userq fence signal processing Christian König
2026-04-22 10:16   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 06/11] drm/amdgpu: remove almost all calls to amdgpu_userq_detect_and_reset_queues Christian König
2026-04-22 10:20   ` Khatri, Sunil [this message]
2026-04-21 12:55 ` [PATCH 07/11] drm/amdgpu: fix userq hang detection and reset Christian König
2026-04-22 10:35   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 08/11] drm/amdgpu: rework userq reset work handling Christian König
2026-04-23 10:43   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 09/11] drm/amdgpu: revert to old status lock handling v4 Christian König
2026-04-23 10:45   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 10/11] drm/amdgpu: restructure VM state machine v2 Christian König
2026-04-23 10:46   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 11/11] drm/amdgpu: WIP sync amdgpu_ttm_fill_mem only to kernel fences Christian König
2026-04-23 10:47   ` Khatri, Sunil

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=30fcb9fe-cef3-4320-b430-735071e808c8@amd.com \
    --to=sukhatri@amd.com \
    --cc=Prike.Liang@amd.com \
    --cc=alexander.deucher@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    --cc=ckoenig.leichtzumerken@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox