public inbox for amd-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed
From: "Khatri, Sunil" <sukhatri@amd.com>
To: "Christian König" <ckoenig.leichtzumerken@gmail.com>,
	alexander.deucher@amd.com, Prike.Liang@amd.com,
	amd-gfx@lists.freedesktop.org
Cc: christian.koenig@amd.com
Subject: Re: [PATCH 08/11] drm/amdgpu: rework userq reset work handling
Date: Thu, 23 Apr 2026 16:13:30 +0530	[thread overview]
Message-ID: <99fbb7b9-bdd4-4c97-b089-5f6a3bb7a6c8@amd.com> (raw)
In-Reply-To: <20260421125513.4545-8-christian.koenig@amd.com>

[-- Attachment #1: Type: text/plain, Size: 9526 bytes --]


On 21-04-2026 06:25 pm, Christian König wrote:
> It is illegal to schedule reset work from another reset work!
>
> Fix this by scheduling the userq reset work directly on the work queue
> of the reset domain.
>
> Not fully tested, I leave that to the IGT test cases.
>
> Signed-off-by: Christian König<christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 84 +++++++++++-----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  | 16 ++++-
>   4 files changed, 60 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 39894e38fee4..17341e384caf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1191,7 +1191,6 @@ struct amdgpu_device {
>   	bool                            apu_prefer_gtt;
>   
>   	bool                            userq_halt_for_enforce_isolation;
> -	struct work_struct              userq_reset_work;
>   	struct amdgpu_uid *uid_info;
>   
>   	struct amdgpu_uma_carveout_info uma_info;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b11c4b5fa8fc..cf61be17e061 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3786,7 +3786,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	}
>   
>   	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> -	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>   
>   	amdgpu_coredump_init(adev);
>   
> @@ -5477,7 +5476,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>   	if (!amdgpu_sriov_vf(adev))
>   		cancel_work(&adev->reset_work);
>   #endif
> -	cancel_work(&adev->userq_reset_work);
> +	amdgpu_userq_mgr_cancel_reset_work(adev);
>   
>   	if (adev->kfd.dev)
>   		cancel_work(&adev->kfd.reset_work);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 0a4c39d83adc..ad6dac17dd21 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -82,19 +82,11 @@ static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
>   	return false;
>   }
>   
> -static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> -{
> -	if (amdgpu_device_should_recover_gpu(adev)) {
> -		amdgpu_reset_domain_schedule(adev->reset_domain,
> -					     &adev->userq_reset_work);
> -		/* Wait for the reset job to complete */
> -		flush_work(&adev->userq_reset_work);
> -	}
> -}
> -
> -static int
> -amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> +static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
>   {
> +	struct amdgpu_userq_mgr *uq_mgr =
> +		container_of(work, struct amdgpu_userq_mgr,
> +			     reset_work);
>   	struct amdgpu_device *adev = uq_mgr->adev;
>   	const int queue_types[] = {
>   		AMDGPU_RING_TYPE_COMPUTE,
> @@ -103,12 +95,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>   	};
>   	const int num_queue_types = ARRAY_SIZE(queue_types);
>   	bool gpu_reset = false;
> -	int r = 0;
> -	int i;
> +	int i, r;
>   
>   	if (unlikely(adev->debug_disable_gpu_ring_reset)) {
>   		dev_err(adev->dev, "userq reset disabled by debug mask\n");
> -		return 0;
> +		return;
>   	}
>   
>   	/*
> @@ -116,7 +107,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>   	 * skip all reset detection logic
>   	 */
>   	if (!amdgpu_gpu_recovery)
> -		return 0;
> +		return;
>   
>   	/*
>   	 * Iterate through all queue types to detect and reset problematic queues
> @@ -141,10 +132,19 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>   		}
>   	}
>   
> -	if (gpu_reset)
> -		amdgpu_userq_gpu_reset(adev);
> +	if (gpu_reset) {
> +		struct amdgpu_reset_context reset_context;
>   
> -	return r;
> +		memset(&reset_context, 0, sizeof(reset_context));
> +
> +		reset_context.method = AMD_RESET_METHOD_NONE;
> +		reset_context.reset_req_dev = adev;
> +		reset_context.src = AMDGPU_RESET_SRC_USERQ;
> +		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> +		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +	}
>   }
>   
>   static void amdgpu_userq_hang_detect_work(struct work_struct *work)
The function and the work handler are using the same name, and it 
causes confusion when reading the code.
queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
                            msecs_to_jiffies(timeout_ms)); The queued 
item here calls the work item whose function name is the same, so it 
would be better if we could keep a different name.

Regards
Sunil Khatri

> @@ -153,7 +153,11 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
>   		container_of(work, struct amdgpu_usermode_queue,
>   			     hang_detect_work.work);
>   
> -	amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);
> +	/*
> +	 * Don't schedule the work here! Scheduling or queue work from one reset
> +	 * handler to another is illegal if you don't take extra precautions!
> +	 */
> +	amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
>   }
>   
>   /*
> @@ -182,8 +186,8 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
>   		break;
>   	}
>   
> -	schedule_delayed_work(&queue->hang_detect_work,
> -		     msecs_to_jiffies(timeout_ms));
> +	queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
> +			   msecs_to_jiffies(timeout_ms));
>   }
>   
>   void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell)
> @@ -1256,28 +1260,13 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>   	if (ret) {
>   		drm_file_err(uq_mgr->file,
>   			     "Couldn't unmap all the queues, eviction failed ret=%d\n", ret);
> -		amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +		amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain,
> +					     &uq_mgr->reset_work);
> +		flush_work(&uq_mgr->reset_work);
flush_work() is called here with the userq_mutex held? Is it OK to block 
for that long? I'm not sure about it, but flush_work() might try to 
take the userq_mutex again — that was the problem initially during reset.
>   	}
>   	return ret;
>   }
>   
> -void amdgpu_userq_reset_work(struct work_struct *work)
> -{
> -	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> -						  userq_reset_work);
> -	struct amdgpu_reset_context reset_context;
> -
> -	memset(&reset_context, 0, sizeof(reset_context));
> -
> -	reset_context.method = AMD_RESET_METHOD_NONE;
> -	reset_context.reset_req_dev = adev;
> -	reset_context.src = AMDGPU_RESET_SRC_USERQ;
> -	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> -	/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> -
> -	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> -}
> -
>   static void
>   amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>   {
> @@ -1311,9 +1300,24 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
>   	userq_mgr->file = file_priv;
>   
>   	INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
> +	INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
>   	return 0;
>   }
>   
> +void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev)
> +{
> +	struct xarray *xa = &adev->userq_doorbell_xa;
> +	struct amdgpu_usermode_queue *queue;
> +	unsigned long flags, queue_id;
> +
> +	xa_lock_irqsave(xa, flags);
> +	xa_for_each(xa, queue_id, queue) {
> +		cancel_delayed_work(&queue->hang_detect_work);
> +		cancel_work(&queue->userq_mgr->reset_work);
> +	}
> +	xa_unlock_irqrestore(xa, flags);
> +}
> +
>   void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr)
>   {
>   	cancel_delayed_work_sync(&userq_mgr->resume_work);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 85f460e7c31b..49b33e2d6932 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -84,7 +84,13 @@ struct amdgpu_usermode_queue {
>   	u32			xcp_id;
>   	int			priority;
>   	struct dentry		*debugfs_queue;
> -	struct delayed_work hang_detect_work;
> +
> +	/**
> +	 * @hang_detect_work:
> +	 *
> +	 * Delayed work which runs when userq_fences time out.
> +	 */
> +	struct delayed_work	hang_detect_work;
>   	struct kref		refcount;
>   
>   	struct list_head	userq_va_list;
> @@ -116,6 +122,13 @@ struct amdgpu_userq_mgr {
>   	struct amdgpu_device		*adev;
>   	struct delayed_work		resume_work;
>   	struct drm_file			*file;
> +
> +	/**
> +	 * @reset_work:
> +	 *
> +	 * Reset work which is used when eviction fails.
> +	 */
> +	struct work_struct		reset_work;
>   	atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
>   };
>   
> @@ -134,6 +147,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp
>   int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
>   			  struct amdgpu_device *adev);
>   
> +void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev);
>   void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr);
>   void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);
>   

[-- Attachment #2: Type: text/html, Size: 10220 bytes --]

  reply	other threads:[~2026-04-23 10:43 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-21 12:55 [PATCH 01/11] drm/amdgpu: fix AMDGPU_INFO_READ_MMR_REG Christian König
2026-04-21 12:55 ` [PATCH 02/11] drm/amdgpu: remove deadlocks from amdgpu_userq_pre_reset Christian König
2026-04-22  4:53   ` Khatri, Sunil
2026-04-22  7:13     ` Christian König
2026-04-22  7:19       ` Khatri, Sunil
2026-04-22  7:24         ` Christian König
2026-04-22  7:29           ` Khatri, Sunil
2026-04-27  8:45   ` Liang, Prike
2026-04-21 12:55 ` [PATCH 03/11] drm/amdgpu: nuke amdgpu_userq_fence_free Christian König
2026-04-22  8:29   ` Khatri, Sunil
2026-04-22  9:26     ` Christian König
2026-04-22  9:40       ` Khatri, Sunil
2026-04-22 10:12         ` Christian König
2026-04-22 14:32           ` Khatri, Sunil
2026-04-27  6:21   ` Liang, Prike
2026-04-21 12:55 ` [PATCH 04/11] drm/amdgpu: rework amdgpu_userq_signal_ioctl Christian König
2026-04-22 10:08   ` Khatri, Sunil
2026-04-22 10:14     ` Christian König
2026-04-22 15:14       ` Khatri, Sunil
2026-04-23  9:58   ` Liang, Prike
2026-04-23 10:47     ` Christian König
2026-04-23 10:54       ` Khatri, Sunil
2026-04-24  8:01       ` Liang, Prike
2026-04-24 13:02         ` Christian König
2026-04-21 12:55 ` [PATCH 05/11] drm/amdgpu: rework userq fence signal processing Christian König
2026-04-22 10:16   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 06/11] drm/amdgpu: remove almost all calls to amdgpu_userq_detect_and_reset_queues Christian König
2026-04-22 10:20   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 07/11] drm/amdgpu: fix userq hang detection and reset Christian König
2026-04-22 10:35   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 08/11] drm/amdgpu: rework userq reset work handling Christian König
2026-04-23 10:43   ` Khatri, Sunil [this message]
2026-04-21 12:55 ` [PATCH 09/11] drm/amdgpu: revert to old status lock handling v4 Christian König
2026-04-23 10:45   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 10/11] drm/amdgpu: restructure VM state machine v2 Christian König
2026-04-23 10:46   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 11/11] drm/amdgpu: WIP sync amdgpu_ttm_fill_mem only to kernel fences Christian König
2026-04-23 10:47   ` Khatri, Sunil

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=99fbb7b9-bdd4-4c97-b089-5f6a3bb7a6c8@amd.com \
    --to=sukhatri@amd.com \
    --cc=Prike.Liang@amd.com \
    --cc=alexander.deucher@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    --cc=ckoenig.leichtzumerken@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox