Re: [PATCH 08/11] drm/amdgpu: rework userq reset work handling

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Khatri, Sunil" <sukhatri@amd.com>
To: "Christian König" <ckoenig.leichtzumerken@gmail.com>,
	alexander.deucher@amd.com, Prike.Liang@amd.com,
	amd-gfx@lists.freedesktop.org
Cc: christian.koenig@amd.com
Subject: Re: [PATCH 08/11] drm/amdgpu: rework userq reset work handling
Date: Thu, 23 Apr 2026 16:13:30 +0530	[thread overview]
Message-ID: <99fbb7b9-bdd4-4c97-b089-5f6a3bb7a6c8@amd.com> (raw)
In-Reply-To: <20260421125513.4545-8-christian.koenig@amd.com>

[-- Attachment #1: Type: text/plain, Size: 9526 bytes --]


On 21-04-2026 06:25 pm, Christian König wrote:
> It is illegal to schedule reset work from another reset work!
>
> Fix this by scheduling the userq reset work directly on the work queue
> of the reset domain.
>
> Not fully tested, I leave that to the IGT test cases.
>
> Signed-off-by: Christian König<christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 84 +++++++++++-----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  | 16 ++++-
>   4 files changed, 60 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 39894e38fee4..17341e384caf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1191,7 +1191,6 @@ struct amdgpu_device {
>   	bool                            apu_prefer_gtt;
>   
>   	bool                            userq_halt_for_enforce_isolation;
> -	struct work_struct              userq_reset_work;
>   	struct amdgpu_uid *uid_info;
>   
>   	struct amdgpu_uma_carveout_info uma_info;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b11c4b5fa8fc..cf61be17e061 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3786,7 +3786,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	}
>   
>   	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> -	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>   
>   	amdgpu_coredump_init(adev);
>   
> @@ -5477,7 +5476,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>   	if (!amdgpu_sriov_vf(adev))
>   		cancel_work(&adev->reset_work);
>   #endif
> -	cancel_work(&adev->userq_reset_work);
> +	amdgpu_userq_mgr_cancel_reset_work(adev);
>   
>   	if (adev->kfd.dev)
>   		cancel_work(&adev->kfd.reset_work);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 0a4c39d83adc..ad6dac17dd21 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -82,19 +82,11 @@ static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
>   	return false;
>   }
>   
> -static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> -{
> -	if (amdgpu_device_should_recover_gpu(adev)) {
> -		amdgpu_reset_domain_schedule(adev->reset_domain,
> -					     &adev->userq_reset_work);
> -		/* Wait for the reset job to complete */
> -		flush_work(&adev->userq_reset_work);
> -	}
> -}
> -
> -static int
> -amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> +static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
>   {
> +	struct amdgpu_userq_mgr *uq_mgr =
> +		container_of(work, struct amdgpu_userq_mgr,
> +			     reset_work);
>   	struct amdgpu_device *adev = uq_mgr->adev;
>   	const int queue_types[] = {
>   		AMDGPU_RING_TYPE_COMPUTE,
> @@ -103,12 +95,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>   	};
>   	const int num_queue_types = ARRAY_SIZE(queue_types);
>   	bool gpu_reset = false;
> -	int r = 0;
> -	int i;
> +	int i, r;
>   
>   	if (unlikely(adev->debug_disable_gpu_ring_reset)) {
>   		dev_err(adev->dev, "userq reset disabled by debug mask\n");
> -		return 0;
> +		return;
>   	}
>   
>   	/*
> @@ -116,7 +107,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>   	 * skip all reset detection logic
>   	 */
>   	if (!amdgpu_gpu_recovery)
> -		return 0;
> +		return;
>   
>   	/*
>   	 * Iterate through all queue types to detect and reset problematic queues
> @@ -141,10 +132,19 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>   		}
>   	}
>   
> -	if (gpu_reset)
> -		amdgpu_userq_gpu_reset(adev);
> +	if (gpu_reset) {
> +		struct amdgpu_reset_context reset_context;
>   
> -	return r;
> +		memset(&reset_context, 0, sizeof(reset_context));
> +
> +		reset_context.method = AMD_RESET_METHOD_NONE;
> +		reset_context.reset_req_dev = adev;
> +		reset_context.src = AMDGPU_RESET_SRC_USERQ;
> +		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> +		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +	}
>   }
>   
>   static void amdgpu_userq_hang_detect_work(struct work_struct *work)
The handler function and the work item it services use the same name,
which makes the code confusing to follow. For example:

queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
                   msecs_to_jiffies(timeout_ms));

The work item queued here invokes a handler function of the same name,
so it would be better to give them different names.

Regards
Sunil Khatri

> @@ -153,7 +153,11 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
>   		container_of(work, struct amdgpu_usermode_queue,
>   			     hang_detect_work.work);
>   
> -	amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);
> +	/*
> +	 * Don't schedule the work here! Scheduling or queue work from one reset
> +	 * handler to another is illegal if you don't take extra precautions!
> +	 */
> +	amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
>   }
>   
>   /*
> @@ -182,8 +186,8 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
>   		break;
>   	}
>   
> -	schedule_delayed_work(&queue->hang_detect_work,
> -		     msecs_to_jiffies(timeout_ms));
> +	queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
> +			   msecs_to_jiffies(timeout_ms));
>   }
>   
>   void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell)
> @@ -1256,28 +1260,13 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>   	if (ret) {
>   		drm_file_err(uq_mgr->file,
>   			     "Couldn't unmap all the queues, eviction failed ret=%d\n", ret);
> -		amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +		amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain,
> +					     &uq_mgr->reset_work);
> +		flush_work(&uq_mgr->reset_work);
Is flush_work() called here with userq_mutex held? Is it OK to block for
that long? I'm not sure about it, but the flushed work might try to take
userq_mutex again, which was the original problem during reset.
>   	}
>   	return ret;
>   }
>   
> -void amdgpu_userq_reset_work(struct work_struct *work)
> -{
> -	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> -						  userq_reset_work);
> -	struct amdgpu_reset_context reset_context;
> -
> -	memset(&reset_context, 0, sizeof(reset_context));
> -
> -	reset_context.method = AMD_RESET_METHOD_NONE;
> -	reset_context.reset_req_dev = adev;
> -	reset_context.src = AMDGPU_RESET_SRC_USERQ;
> -	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> -	/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> -
> -	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> -}
> -
>   static void
>   amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>   {
> @@ -1311,9 +1300,24 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
>   	userq_mgr->file = file_priv;
>   
>   	INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
> +	INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
>   	return 0;
>   }
>   
> +void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev)
> +{
> +	struct xarray *xa = &adev->userq_doorbell_xa;
> +	struct amdgpu_usermode_queue *queue;
> +	unsigned long flags, queue_id;
> +
> +	xa_lock_irqsave(xa, flags);
> +	xa_for_each(xa, queue_id, queue) {
> +		cancel_delayed_work(&queue->hang_detect_work);
> +		cancel_work(&queue->userq_mgr->reset_work);
> +	}
> +	xa_unlock_irqrestore(xa, flags);
> +}
> +
>   void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr)
>   {
>   	cancel_delayed_work_sync(&userq_mgr->resume_work);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 85f460e7c31b..49b33e2d6932 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -84,7 +84,13 @@ struct amdgpu_usermode_queue {
>   	u32			xcp_id;
>   	int			priority;
>   	struct dentry		*debugfs_queue;
> -	struct delayed_work hang_detect_work;
> +
> +	/**
> +	 * @hang_detect_work:
> +	 *
> +	 * Delayed work which runs when userq_fences time out.
> +	 */
> +	struct delayed_work	hang_detect_work;
>   	struct kref		refcount;
>   
>   	struct list_head	userq_va_list;
> @@ -116,6 +122,13 @@ struct amdgpu_userq_mgr {
>   	struct amdgpu_device		*adev;
>   	struct delayed_work		resume_work;
>   	struct drm_file			*file;
> +
> +	/**
> +	 * @reset_work:
> +	 *
> +	 * Reset work which is used when eviction fails.
> +	 */
> +	struct work_struct		reset_work;
>   	atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
>   };
>   
> @@ -134,6 +147,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp
>   int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
>   			  struct amdgpu_device *adev);
>   
> +void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev);
>   void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr);
>   void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);
>   

[-- Attachment #2: Type: text/html, Size: 10220 bytes --]

next prev parent reply	other threads:[~2026-04-23 10:43 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-21 12:55 [PATCH 01/11] drm/amdgpu: fix AMDGPU_INFO_READ_MMR_REG Christian König
2026-04-21 12:55 ` [PATCH 02/11] drm/amdgpu: remove deadlocks from amdgpu_userq_pre_reset Christian König
2026-04-22  4:53   ` Khatri, Sunil
2026-04-22  7:13     ` Christian König
2026-04-22  7:19       ` Khatri, Sunil
2026-04-22  7:24         ` Christian König
2026-04-22  7:29           ` Khatri, Sunil
2026-04-27  8:45   ` Liang, Prike
2026-04-21 12:55 ` [PATCH 03/11] drm/amdgpu: nuke amdgpu_userq_fence_free Christian König
2026-04-22  8:29   ` Khatri, Sunil
2026-04-22  9:26     ` Christian König
2026-04-22  9:40       ` Khatri, Sunil
2026-04-22 10:12         ` Christian König
2026-04-22 14:32           ` Khatri, Sunil
2026-04-27  6:21   ` Liang, Prike
2026-04-21 12:55 ` [PATCH 04/11] drm/amdgpu: rework amdgpu_userq_signal_ioctl Christian König
2026-04-22 10:08   ` Khatri, Sunil
2026-04-22 10:14     ` Christian König
2026-04-22 15:14       ` Khatri, Sunil
2026-04-23  9:58   ` Liang, Prike
2026-04-23 10:47     ` Christian König
2026-04-23 10:54       ` Khatri, Sunil
2026-04-24  8:01       ` Liang, Prike
2026-04-24 13:02         ` Christian König
2026-04-21 12:55 ` [PATCH 05/11] drm/amdgpu: rework userq fence signal processing Christian König
2026-04-22 10:16   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 06/11] drm/amdgpu: remove almost all calls to amdgpu_userq_detect_and_reset_queues Christian König
2026-04-22 10:20   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 07/11] drm/amdgpu: fix userq hang detection and reset Christian König
2026-04-22 10:35   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 08/11] drm/amdgpu: rework userq reset work handling Christian König
2026-04-23 10:43   ` Khatri, Sunil [this message]
2026-05-11 17:50     ` Christian König
2026-05-11 17:58       ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 09/11] drm/amdgpu: revert to old status lock handling v4 Christian König
2026-04-23 10:45   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 10/11] drm/amdgpu: restructure VM state machine v2 Christian König
2026-04-23 10:46   ` Khatri, Sunil
2026-04-21 12:55 ` [PATCH 11/11] drm/amdgpu: WIP sync amdgpu_ttm_fill_mem only to kernel fences Christian König
2026-04-23 10:47   ` Khatri, Sunil

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=99fbb7b9-bdd4-4c97-b089-5f6a3bb7a6c8@amd.com \
    --to=sukhatri@amd.com \
    --cc=Prike.Liang@amd.com \
    --cc=alexander.deucher@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    --cc=ckoenig.leichtzumerken@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.