[PATCH 1/3] drm/amdgpu: Implement user queue reset functionality

AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality
@ 2025-10-24  9:43 Jesse.Zhang
  2025-10-24  9:43 ` [PATCH 2/3] drm/amdgpu: Add user queue reset mask support Jesse.Zhang
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Jesse.Zhang @ 2025-10-24  9:43 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Alex Deucher, Jesse Zhang

From: Alex Deucher <alexander.deucher@amd.com>

This patch adds robust reset handling for user queues (userq) to improve
recovery from queue failures. The key components include:

1. Queue detection and reset logic:
   - amdgpu_userq_detect_and_reset_queues() identifies failed queues
   - Per-IP detect_and_reset callbacks for targeted recovery
   - Falls back to full GPU reset when needed

2. Reset infrastructure:
   - Adds a userq_reset_work work item for async reset handling
   - Implements pre/post reset handlers for queue state management
   - Integrates with existing GPU reset framework

3. Error handling improvements:
   - Enhanced state tracking with HUNG state
   - Automatic reset triggering on critical failures
   - VRAM loss handling during recovery

4. Integration points:
   - Added to device init/reset paths
   - Called during queue destroy, suspend, and isolation events
   - Handles both individual queue and full GPU resets

The reset functionality works with both gfx/compute and sdma queues,
providing better resilience against queue failures while minimizing
disruption to unaffected queues.

v2: add detection and reset calls when preemption/unmap fails.
    add a per device userq counter for each user queue type. (Alex)
v3: make sure we hold the adev->userq_mutex when we call amdgpu_userq_detect_and_reset_queues. (Alex)
   warn if the adev->userq_mutex is not held.
v4: make sure we have all of the uqm->userq_mutex held.
   warn if the uqm->userq_mutex is not held.

v5: Use an array for user queue type counters. (Alex)
    all of the uqm->userq_mutex locks need to be held when calling detect and reset. (Alex)

v6: fix lockdep warning in amdgpu_userq_fence_driver_process

v7: add the queue types in an array and use a loop in amdgpu_userq_detect_and_reset_queues (Lijo)
v8: remove atomic_set(&userq_mgr->userq_count[i], 0).
   it should already be 0 since we kzalloc the structure (Alex)

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   8 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 176 +++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |   5 +
 5 files changed, 179 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 50079209c472..d0fb4eb1d7c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1316,6 +1316,7 @@ struct amdgpu_device {
 	bool                            apu_prefer_gtt;
 
 	bool                            userq_halt_for_enforce_isolation;
+	struct work_struct              userq_reset_work;
 	struct amdgpu_uid *uid_info;
 
 	/* KFD
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b8d91247f51a..8480b72258f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4608,6 +4608,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	}
 
 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
 
 	adev->gfx.gfx_off_req_count = 1;
 	adev->gfx.gfx_off_residency = 0;
@@ -5990,6 +5991,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
 				if (r)
 					goto out;
 
+				r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
+				if (r)
+					goto out;
+
 				drm_client_dev_resume(adev_to_drm(tmp_adev), false);
 
 				/*
@@ -6212,6 +6217,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 	if (!amdgpu_sriov_vf(adev))
 		cancel_work(&adev->reset_work);
 #endif
+	cancel_work(&adev->userq_reset_work);
 
 	if (adev->kfd.dev)
 		cancel_work(&adev->kfd.reset_work);
@@ -6332,6 +6338,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
 		    amdgpu_device_ip_need_full_reset(tmp_adev))
 			amdgpu_ras_suspend(tmp_adev);
 
+		amdgpu_userq_pre_reset(tmp_adev);
+
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 87b962df5460..7a27c6c4bb44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -83,6 +83,7 @@ enum amdgpu_ring_type {
 	AMDGPU_RING_TYPE_MES,
 	AMDGPU_RING_TYPE_UMSCH_MM,
 	AMDGPU_RING_TYPE_CPER,
+	AMDGPU_RING_TYPE_MAX,
 };
 
 enum amdgpu_ib_pool_type {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index c50b162e80a7..188de848c229 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -25,8 +25,10 @@
 #include <drm/drm_auth.h>
 #include <drm/drm_exec.h>
 #include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
 
 #include "amdgpu.h"
+#include "amdgpu_reset.h"
 #include "amdgpu_vm.h"
 #include "amdgpu_userq.h"
 #include "amdgpu_hmm.h"
@@ -45,6 +47,69 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
 	return userq_ip_mask;
 }
 
+static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
+{
+	if (amdgpu_device_should_recover_gpu(adev)) {
+		amdgpu_reset_domain_schedule(adev->reset_domain,
+					     &adev->userq_reset_work);
+		/* Wait for the reset job to complete */
+		flush_work(&adev->userq_reset_work);
+	}
+}
+
+static int
+amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+{
+	struct amdgpu_device *adev = uq_mgr->adev;
+	const int queue_types[] = {
+		AMDGPU_RING_TYPE_COMPUTE,
+		AMDGPU_RING_TYPE_GFX,
+		AMDGPU_RING_TYPE_SDMA
+	};
+	const int num_queue_types = ARRAY_SIZE(queue_types);
+	bool gpu_reset = false;
+	int r = 0;
+	int i;
+
+	/* Warning if current process mutex is not held */
+	WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
+
+	if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+		dev_err(adev->dev, "userq reset disabled by debug mask\n");
+		return 0;
+	}
+
+	/*
+	 * If GPU recovery feature is disabled system-wide,
+	 * skip all reset detection logic
+	 */
+	if (!amdgpu_gpu_recovery)
+		return 0;
+
+	/*
+	 * Iterate through all queue types to detect and reset problematic queues
+	 * Process each queue type in the defined order
+	 */
+	for (i = 0; i < num_queue_types; i++) {
+		int ring_type = queue_types[i];
+		const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
+
+		if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
+		    funcs && funcs->detect_and_reset) {
+			r = funcs->detect_and_reset(adev, ring_type);
+			if (r) {
+				gpu_reset = true;
+				break;
+			}
+		}
+	}
+
+	if (gpu_reset)
+		amdgpu_userq_gpu_reset(adev);
+
+	return r;
+}
+
 static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
 					   struct amdgpu_bo_va_mapping *va_map, u64 addr)
 {
@@ -175,17 +240,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *userq_funcs =
 		adev->userq_funcs[queue->queue_type];
+	bool found_hung_queue = false;
 	int r = 0;
 
 	if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
 		r = userq_funcs->preempt(uq_mgr, queue);
 		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
+			found_hung_queue = true;
 		} else {
 			queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
 		}
 	}
 
+	if (found_hung_queue)
+		amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
 	return r;
 }
 
@@ -217,16 +287,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *userq_funcs =
 		adev->userq_funcs[queue->queue_type];
+	bool found_hung_queue = false;
 	int r = 0;
 
 	if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
 		(queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
 		r = userq_funcs->unmap(uq_mgr, queue);
-		if (r)
+		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
-		else
+			found_hung_queue = true;
+		} else {
 			queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
+		}
 	}
+
+	if (found_hung_queue)
+		amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
 	return r;
 }
 
@@ -243,10 +320,12 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
 		r = userq_funcs->map(uq_mgr, queue);
 		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
+			amdgpu_userq_detect_and_reset_queues(uq_mgr);
 		} else {
 			queue->state = AMDGPU_USERQ_STATE_MAPPED;
 		}
 	}
+
 	return r;
 }
 
@@ -474,10 +553,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
 		amdgpu_bo_unreserve(queue->db_obj.obj);
 	}
 	amdgpu_bo_unref(&queue->db_obj.obj);
-
+	atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
 #if defined(CONFIG_DEBUG_FS)
 	debugfs_remove_recursive(queue->debugfs_queue);
 #endif
+	amdgpu_userq_detect_and_reset_queues(uq_mgr);
 	r = amdgpu_userq_unmap_helper(uq_mgr, queue);
 	/*TODO: It requires a reset for userq hw unmap error*/
 	if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
@@ -699,6 +779,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 	kfree(queue_name);
 
 	args->out.queue_id = qid;
+	atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
 
 unlock:
 	mutex_unlock(&uq_mgr->userq_mutex);
@@ -1043,6 +1124,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
 	unsigned long queue_id;
 	int ret = 0, r;
 
+	amdgpu_userq_detect_and_reset_queues(uq_mgr);
 	/* Try to unmap all the queues in this process ctx */
 	xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
 		r = amdgpu_userq_preempt_helper(uq_mgr, queue);
@@ -1055,6 +1137,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
 	return ret;
 }
 
+void amdgpu_userq_reset_work(struct work_struct *work)
+{
+	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
+						  userq_reset_work);
+	struct amdgpu_reset_context reset_context;
+
+	memset(&reset_context, 0, sizeof(reset_context));
+
+	reset_context.method = AMD_RESET_METHOD_NONE;
+	reset_context.reset_req_dev = adev;
+	reset_context.src = AMDGPU_RESET_SRC_USERQ;
+	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+	/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}
+
 static int
 amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
 {
@@ -1082,22 +1181,19 @@ void
 amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
 		   struct amdgpu_eviction_fence *ev_fence)
 {
-	int ret;
 	struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
 	struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
+	struct amdgpu_device *adev = uq_mgr->adev;
+	int ret;
 
 	/* Wait for any pending userqueue fence work to finish */
 	ret = amdgpu_userq_wait_for_signal(uq_mgr);
-	if (ret) {
-		drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
-		return;
-	}
+	if (ret)
+		dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
 
 	ret = amdgpu_userq_evict_all(uq_mgr);
-	if (ret) {
-		drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
-		return;
-	}
+	if (ret)
+		dev_err(adev->dev, "Failed to evict userqueue\n");
 
 	/* Signal current eviction fence */
 	amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
@@ -1131,6 +1227,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
 	cancel_delayed_work_sync(&userq_mgr->resume_work);
 
 	mutex_lock(&userq_mgr->userq_mutex);
+	amdgpu_userq_detect_and_reset_queues(userq_mgr);
 	xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
 		amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
 		amdgpu_userq_unmap_helper(userq_mgr, queue);
@@ -1157,6 +1254,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
 		uqm = queue->userq_mgr;
 		cancel_delayed_work_sync(&uqm->resume_work);
 		guard(mutex)(&uqm->userq_mutex);
+		amdgpu_userq_detect_and_reset_queues(uqm);
 		if (adev->in_s0ix)
 			r = amdgpu_userq_preempt_helper(uqm, queue);
 		else
@@ -1215,6 +1313,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
 		if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
 		     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
 		    (queue->xcp_id == idx)) {
+			amdgpu_userq_detect_and_reset_queues(uqm);
 			r = amdgpu_userq_preempt_helper(uqm, queue);
 			if (r)
 				ret = r;
@@ -1287,3 +1386,56 @@ int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
 
 	return 0;
 }
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
+{
+	const struct amdgpu_userq_funcs *userq_funcs;
+	struct amdgpu_usermode_queue *queue;
+	struct amdgpu_userq_mgr *uqm;
+	unsigned long queue_id;
+
+	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+		uqm = queue->userq_mgr;
+		cancel_delayed_work_sync(&uqm->resume_work);
+		if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+			amdgpu_userq_wait_for_last_fence(uqm, queue);
+			userq_funcs = adev->userq_funcs[queue->queue_type];
+			userq_funcs->unmap(uqm, queue);
+			/* just mark all queues as hung at this point.
+			 * if unmap succeeds, we could map again
+			 * in amdgpu_userq_post_reset() if vram is not lost
+			 */
+			queue->state = AMDGPU_USERQ_STATE_HUNG;
+			amdgpu_userq_fence_driver_force_completion(queue);
+		}
+	}
+}
+
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
+{
+	/* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
+	 * at this point, we should be able to map it again
+	 * and continue if vram is not lost.
+	 */
+	struct amdgpu_userq_mgr *uqm;
+	struct amdgpu_usermode_queue *queue;
+	const struct amdgpu_userq_funcs *userq_funcs;
+	unsigned long queue_id;
+	int r = 0;
+
+	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+		uqm = queue->userq_mgr;
+		if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
+			userq_funcs = adev->userq_funcs[queue->queue_type];
+			/* Re-map queue */
+			r = userq_funcs->map(uqm, queue);
+			if (r) {
+				dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id);
+				continue;
+			}
+			queue->state = AMDGPU_USERQ_STATE_MAPPED;
+		}
+	}
+
+	return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 09da0617bfa2..c37444427a14 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -106,6 +106,7 @@ struct amdgpu_userq_mgr {
 	struct amdgpu_device		*adev;
 	struct delayed_work		resume_work;
 	struct drm_file			*file;
+	atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
 };
 
 struct amdgpu_db_info {
@@ -148,6 +149,10 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
 						  u32 idx);
 int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
 						   u32 idx);
+void amdgpu_userq_reset_work(struct work_struct *work);
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
+
 int amdgpu_userq_input_va_validate(struct amdgpu_usermode_queue *queue,
 				   u64 addr, u64 expected_size);
 int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/3] drm/amdgpu: Add user queue reset mask support
  2025-10-24  9:43 [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality Jesse.Zhang
@ 2025-10-24  9:43 ` Jesse.Zhang
  2025-10-29 18:13   ` Alex Deucher
  2025-10-24  9:43 ` [PATCH 3/3] drm/amdgpu: use irq-safe lock in amdgpu_userq_fence_driver_process Jesse.Zhang
  2025-10-29 18:09 ` [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality Alex Deucher
  2 siblings, 1 reply; 9+ messages in thread
From: Jesse.Zhang @ 2025-10-24  9:43 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang, Alex Deucher

This commit adds support for tracking and exposing the reset capabilities
of user mode queues across different IP blocks (GFX, Compute, SDMA).

These changes allow userspace to query the reset capabilities of user
mode queues and ensure reset operations are only attempted when supported
by the hardware and driver.

Suggested-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 44 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   | 21 +++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 13 +++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c     | 17 +++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c     | 12 ++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c     | 34 ++++++++++-------
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c     | 24 ++++++++----
 9 files changed, 163 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d0fb4eb1d7c4..48b21863065e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1182,6 +1182,7 @@ struct amdgpu_device {
 	 * Value: struct amdgpu_usermode_queue
 	 */
 	struct xarray userq_doorbell_xa;
+	u32 userq_supported_reset[AMDGPU_RING_TYPE_MAX];
 
 	/* df */
 	struct amdgpu_df                df;
@@ -1612,6 +1613,8 @@ struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
 						  struct amdgpu_ring *ring,
 						  struct amdgpu_job *job);
 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
+ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
+				    int ring_type);
 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8480b72258f2..a0064c5314df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -7649,7 +7649,8 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
 	if (!ring || !ring->adev)
 		return size;
 
-	if (amdgpu_device_should_recover_gpu(ring->adev))
+	if (amdgpu_device_should_recover_gpu(ring->adev) &&
+	    unlikely(!ring->adev->debug_disable_gpu_ring_reset))
 		size |= AMDGPU_RESET_TYPE_FULL;
 
 	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
@@ -7659,6 +7660,20 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
 	return size;
 }
 
+ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev, int ring_type)
+{
+	ssize_t size = 0;
+
+	if (!adev || !adev->userq_funcs[ring_type])
+		return size;
+
+	if (amdgpu_device_should_recover_gpu(adev) &&
+	    unlikely(!adev->debug_disable_gpu_ring_reset))
+		size |= AMDGPU_RESET_TYPE_FULL;
+
+	return size;
+}
+
 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
 {
 	ssize_t size = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 3d24f9cd750a..5597753ec61a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1826,6 +1826,32 @@ static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
 	return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset);
 }
 
+static ssize_t amdgpu_userq_get_gfx_reset_mask(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	if (!adev)
+		return -ENODEV;
+
+	return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_GFX]);
+}
+
+static ssize_t amdgpu_userq_get_compute_reset_mask(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	if (!adev)
+		return -ENODEV;
+
+	return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE]);
+}
+
 static DEVICE_ATTR(run_cleaner_shader, 0200,
 		   NULL, amdgpu_gfx_set_run_cleaner_shader);
 
@@ -1845,6 +1871,12 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
 static DEVICE_ATTR(compute_reset_mask, 0444,
 		   amdgpu_gfx_get_compute_reset_mask, NULL);
 
+static DEVICE_ATTR(gfx_userq_reset_mask, 0444,
+		   amdgpu_userq_get_gfx_reset_mask, NULL);
+
+static DEVICE_ATTR(compute_userq_reset_mask, 0444,
+		   amdgpu_userq_get_compute_reset_mask, NULL);
+
 static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
@@ -1928,6 +1960,18 @@ static int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev)
 			return r;
 	}
 
+	if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) {
+		r = device_create_file(adev->dev, &dev_attr_gfx_userq_reset_mask);
+		if (r)
+			return r;
+	}
+
+	if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) {
+		r = device_create_file(adev->dev, &dev_attr_compute_userq_reset_mask);
+		if (r)
+			return r;
+	}
+
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 8b8a04138711..2fb288b2bfc4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -471,6 +471,21 @@ static ssize_t amdgpu_get_sdma_reset_mask(struct device *dev,
 static DEVICE_ATTR(sdma_reset_mask, 0444,
 		   amdgpu_get_sdma_reset_mask, NULL);
 
+static ssize_t amdgpu_get_sdma_userq_reset_mask(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	if (!adev)
+		return -ENODEV;
+
+	return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_DMA]);
+}
+static DEVICE_ATTR(sdma_userq_reset_mask, 0444,
+		   amdgpu_get_sdma_userq_reset_mask, NULL);
+
 int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
 {
 	int r = 0;
@@ -484,6 +499,12 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
 			return r;
 	}
 
+	if (adev->userq_funcs[AMDGPU_HW_IP_DMA]) {
+		r = device_create_file(adev->dev, &dev_attr_sdma_userq_reset_mask);
+		if (r)
+			return r;
+	}
+
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 188de848c229..15ae72e2d679 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -47,6 +47,16 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
 	return userq_ip_mask;
 }
 
+bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
+                                         int ring_type,
+                                         int reset_type)
+{
+    if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
+        return false;
+
+    return (adev->userq_supported_reset[ring_type] & reset_type) != 0;
+}
+
 static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
 {
 	if (amdgpu_device_should_recover_gpu(adev)) {
@@ -94,6 +104,9 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
 		int ring_type = queue_types[i];
 		const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
 
+		if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE))
+				continue;
+
 		if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
 		    funcs && funcs->detect_and_reset) {
 			r = funcs->detect_and_reset(adev, ring_type);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 252517ce5d5a..82b7c365d720 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1815,6 +1815,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
 		amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
 	adev->gfx.compute_supported_reset =
 		amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
+	adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
+			amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
+	adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
+			amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
+
 	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
 	case IP_VERSION(11, 0, 0):
 	case IP_VERSION(11, 0, 2):
@@ -1824,12 +1829,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
 		    !amdgpu_sriov_vf(adev)) {
 			adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
 			adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE;
 		}
 		break;
 	default:
 		if (!amdgpu_sriov_vf(adev)) {
 			adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
 			adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE;
 		}
 		break;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 35d5a7e99a7c..c5ac42a30789 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -1543,6 +1543,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
 		amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
 	adev->gfx.compute_supported_reset =
 		amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
+	adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
+		amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
+	adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
+		amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
+
 	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
 	case IP_VERSION(12, 0, 0):
 	case IP_VERSION(12, 0, 1):
@@ -1551,6 +1556,13 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
 		    !amdgpu_sriov_vf(adev)) {
 			adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
 			adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+
 		}
 		break;
 	default:
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
index db6e41967f12..8850eaf8d2c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
@@ -1349,19 +1349,6 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block)
 			return r;
 	}
 
-	adev->sdma.supported_reset =
-		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
-	switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
-	case IP_VERSION(6, 0, 0):
-	case IP_VERSION(6, 0, 2):
-	case IP_VERSION(6, 0, 3):
-		if ((adev->sdma.instance[0].fw_version >= 21) &&
-		    !amdgpu_sriov_vf(adev))
-			adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
-		break;
-	default:
-		break;
-	}
 
 	if (amdgpu_sdma_ras_sw_init(adev)) {
 		dev_err(adev->dev, "Failed to initialize sdma ras block!\n");
@@ -1412,6 +1399,27 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block)
 		break;
 	}
 
+	adev->sdma.supported_reset =
+		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+	adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
+		amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA);
+
+	switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
+	case IP_VERSION(6, 0, 0):
+	case IP_VERSION(6, 0, 2):
+	case IP_VERSION(6, 0, 3):
+		if ((adev->sdma.instance[0].fw_version >= 21) &&
+		    !amdgpu_sriov_vf(adev)) {
+			adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+
+		}
+		break;
+	default:
+		break;
+	}
 	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
 	if (r)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 326ecc8d37d2..9de46ac8b1db 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -1335,14 +1335,6 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block)
 			return r;
 	}
 
-	adev->sdma.supported_reset =
-		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
-	if (!amdgpu_sriov_vf(adev))
-		adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
-
-	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
-	if (r)
-		return r;
 	/* Allocate memory for SDMA IP Dump buffer */
 	ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), GFP_KERNEL);
 	if (ptr)
@@ -1360,6 +1352,22 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block)
 		break;
 	}
 
+	adev->sdma.supported_reset =
+		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+	adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
+		amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA);
+
+	if (!amdgpu_sriov_vf(adev)) {
+		adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+		if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
+		    adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
+			adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+
+	}
+	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+	if (r)
+		return r;
+
 	return r;
 }
 
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: Add user queue reset mask support
  2025-10-24  9:43 ` [PATCH 2/3] drm/amdgpu: Add user queue reset mask support Jesse.Zhang
@ 2025-10-29 18:13   ` Alex Deucher
  2025-10-30  2:08     ` Zhang, Jesse(Jie)
  0 siblings, 1 reply; 9+ messages in thread
From: Alex Deucher @ 2025-10-29 18:13 UTC (permalink / raw)
  To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig

On Fri, Oct 24, 2025 at 5:45 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> This commit adds support for tracking and exposing the reset capabilities
> of user mode queues across different IP blocks (GFX, Compute, SDMA).
>
> These changes allow userspace to query the reset capabilities of user
> mode queues and ensure reset operations are only attempted when supported
> by the hardware and driver.
>
> Suggested-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 44 ++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   | 21 +++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 13 +++++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c     | 17 +++++++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c     | 12 ++++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c     | 34 ++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c     | 24 ++++++++----
>  9 files changed, 163 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d0fb4eb1d7c4..48b21863065e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1182,6 +1182,7 @@ struct amdgpu_device {
>          * Value: struct amdgpu_usermode_queue
>          */
>         struct xarray userq_doorbell_xa;
> +       u32 userq_supported_reset[AMDGPU_RING_TYPE_MAX];

I don't think we need a separate userq_supported_reset array.  Just
use the existing reset masks.  We use the same functionality in both
kernel and userq cases so I don't see a reason to have a separate
tracker.

Alex

>
>         /* df */
>         struct amdgpu_df                df;
> @@ -1612,6 +1613,8 @@ struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
>                                                   struct amdgpu_ring *ring,
>                                                   struct amdgpu_job *job);
>  bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
> +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
> +                                   int ring_type);
>  ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
>  ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 8480b72258f2..a0064c5314df 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -7649,7 +7649,8 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
>         if (!ring || !ring->adev)
>                 return size;
>
> -       if (amdgpu_device_should_recover_gpu(ring->adev))
> +       if (amdgpu_device_should_recover_gpu(ring->adev) &&
> +           unlikely(!ring->adev->debug_disable_gpu_ring_reset))
>                 size |= AMDGPU_RESET_TYPE_FULL;
>
>         if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
> @@ -7659,6 +7660,20 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
>         return size;
>  }
>
> +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev, int ring_type)
> +{
> +       ssize_t size = 0;
> +
> +       if (!adev || !adev->userq_funcs[ring_type])
> +               return size;
> +
> +       if (amdgpu_device_should_recover_gpu(adev) &&
> +           unlikely(!adev->debug_disable_gpu_ring_reset))
> +               size |= AMDGPU_RESET_TYPE_FULL;
> +
> +       return size;
> +}
> +
>  ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
>  {
>         ssize_t size = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 3d24f9cd750a..5597753ec61a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1826,6 +1826,32 @@ static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
>         return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset);
>  }
>
> +static ssize_t amdgpu_userq_get_gfx_reset_mask(struct device *dev,
> +                                               struct device_attribute *attr,
> +                                               char *buf)
> +{
> +       struct drm_device *ddev = dev_get_drvdata(dev);
> +       struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> +       if (!adev)
> +               return -ENODEV;
> +
> +       return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_GFX]);
> +}
> +
> +static ssize_t amdgpu_userq_get_compute_reset_mask(struct device *dev,
> +                                               struct device_attribute *attr,
> +                                               char *buf)
> +{
> +       struct drm_device *ddev = dev_get_drvdata(dev);
> +       struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> +       if (!adev)
> +               return -ENODEV;
> +
> +       return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE]);
> +}
> +
>  static DEVICE_ATTR(run_cleaner_shader, 0200,
>                    NULL, amdgpu_gfx_set_run_cleaner_shader);
>
> @@ -1845,6 +1871,12 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
>  static DEVICE_ATTR(compute_reset_mask, 0444,
>                    amdgpu_gfx_get_compute_reset_mask, NULL);
>
> +static DEVICE_ATTR(gfx_userq_reset_mask, 0444,
> +                  amdgpu_userq_get_gfx_reset_mask, NULL);
> +
> +static DEVICE_ATTR(compute_userq_reset_mask, 0444,
> +                  amdgpu_userq_get_compute_reset_mask, NULL);
> +
>  static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)
>  {
>         struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
> @@ -1928,6 +1960,18 @@ static int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev)
>                         return r;
>         }
>
> +       if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) {
> +               r = device_create_file(adev->dev, &dev_attr_gfx_userq_reset_mask);
> +               if (r)
> +                       return r;
> +       }
> +
> +       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) {
> +               r = device_create_file(adev->dev, &dev_attr_compute_userq_reset_mask);
> +               if (r)
> +                       return r;
> +       }
> +
>         return r;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 8b8a04138711..2fb288b2bfc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -471,6 +471,21 @@ static ssize_t amdgpu_get_sdma_reset_mask(struct device *dev,
>  static DEVICE_ATTR(sdma_reset_mask, 0444,
>                    amdgpu_get_sdma_reset_mask, NULL);
>
> +static ssize_t amdgpu_get_sdma_userq_reset_mask(struct device *dev,
> +                                               struct device_attribute *attr,
> +                                               char *buf)
> +{
> +       struct drm_device *ddev = dev_get_drvdata(dev);
> +       struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> +       if (!adev)
> +               return -ENODEV;
> +
> +       return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_DMA]);
> +}
> +static DEVICE_ATTR(sdma_userq_reset_mask, 0444,
> +                  amdgpu_get_sdma_userq_reset_mask, NULL);
> +
>  int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
>  {
>         int r = 0;
> @@ -484,6 +499,12 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
>                         return r;
>         }
>
> +       if (adev->userq_funcs[AMDGPU_HW_IP_DMA]) {
> +               r = device_create_file(adev->dev, &dev_attr_sdma_userq_reset_mask);
> +               if (r)
> +                       return r;
> +       }
> +
>         return r;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 188de848c229..15ae72e2d679 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -47,6 +47,16 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
>         return userq_ip_mask;
>  }
>
> +bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
> +                                         int ring_type,
> +                                         int reset_type)
> +{
> +    if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
> +        return false;
> +
> +    return (adev->userq_supported_reset[ring_type] & reset_type) != 0;
> +}
> +
>  static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
>  {
>         if (amdgpu_device_should_recover_gpu(adev)) {
> @@ -94,6 +104,9 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>                 int ring_type = queue_types[i];
>                 const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
>
> +               if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE))
> +                               continue;
> +
>                 if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
>                     funcs && funcs->detect_and_reset) {
>                         r = funcs->detect_and_reset(adev, ring_type);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 252517ce5d5a..82b7c365d720 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1815,6 +1815,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
>                 amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>         adev->gfx.compute_supported_reset =
>                 amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> +                       amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> +                       amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> +
>         switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>         case IP_VERSION(11, 0, 0):
>         case IP_VERSION(11, 0, 2):
> @@ -1824,12 +1829,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
>                     !amdgpu_sriov_vf(adev)) {
>                         adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
>                         adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> +                           adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> +                           adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE;
>                 }
>                 break;
>         default:
>                 if (!amdgpu_sriov_vf(adev)) {
>                         adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
>                         adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> +                           adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> +                           adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE;
>                 }
>                 break;
>         }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 35d5a7e99a7c..c5ac42a30789 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1543,6 +1543,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
>                 amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>         adev->gfx.compute_supported_reset =
>                 amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> +               amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> +               amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> +
>         switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>         case IP_VERSION(12, 0, 0):
>         case IP_VERSION(12, 0, 1):
> @@ -1551,6 +1556,13 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
>                     !amdgpu_sriov_vf(adev)) {
>                         adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
>                         adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> +                           adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> +                           adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +
>                 }
>                 break;
>         default:
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> index db6e41967f12..8850eaf8d2c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> @@ -1349,19 +1349,6 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block)
>                         return r;
>         }
>
> -       adev->sdma.supported_reset =
> -               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> -       switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> -       case IP_VERSION(6, 0, 0):
> -       case IP_VERSION(6, 0, 2):
> -       case IP_VERSION(6, 0, 3):
> -               if ((adev->sdma.instance[0].fw_version >= 21) &&
> -                   !amdgpu_sriov_vf(adev))
> -                       adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> -               break;
> -       default:
> -               break;
> -       }
>
>         if (amdgpu_sdma_ras_sw_init(adev)) {
>                 dev_err(adev->dev, "Failed to initialize sdma ras block!\n");
> @@ -1412,6 +1399,27 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block)
>                 break;
>         }
>
> +       adev->sdma.supported_reset =
> +               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> +               amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA);
> +
> +       switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> +       case IP_VERSION(6, 0, 0):
> +       case IP_VERSION(6, 0, 2):
> +       case IP_VERSION(6, 0, 3):
> +               if ((adev->sdma.instance[0].fw_version >= 21) &&
> +                   !amdgpu_sriov_vf(adev)) {
> +                       adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> +                           adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +
> +               }
> +               break;
> +       default:
> +               break;
> +       }
>         r = amdgpu_sdma_sysfs_reset_mask_init(adev);
>         if (r)
>                 return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> index 326ecc8d37d2..9de46ac8b1db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> @@ -1335,14 +1335,6 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block)
>                         return r;
>         }
>
> -       adev->sdma.supported_reset =
> -               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> -       if (!amdgpu_sriov_vf(adev))
> -               adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> -
> -       r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> -       if (r)
> -               return r;
>         /* Allocate memory for SDMA IP Dump buffer */
>         ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), GFP_KERNEL);
>         if (ptr)
> @@ -1360,6 +1352,22 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block)
>                 break;
>         }
>
> +       adev->sdma.supported_reset =
> +               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> +               amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA);
> +
> +       if (!amdgpu_sriov_vf(adev)) {
> +               adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +               if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> +                   adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
> +                       adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +
> +       }
> +       r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> +       if (r)
> +               return r;
> +
>         return r;
>  }
>
> --
> 2.49.0
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: Add user queue reset mask support
  2025-10-29 18:13   ` Alex Deucher
@ 2025-10-30  2:08     ` Zhang, Jesse(Jie)
  2025-10-30 12:41       ` Alex Deucher
  0 siblings, 1 reply; 9+ messages in thread
From: Zhang, Jesse(Jie) @ 2025-10-30  2:08 UTC (permalink / raw)
  To: Alex Deucher
  Cc: amd-gfx@lists.freedesktop.org, Deucher,  Alexander,
	Koenig, Christian

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Alex
> Deucher
> Sent: Thursday, October 30, 2025 2:14 AM
> To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>
> Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander
> <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: Re: [PATCH 2/3] drm/amdgpu: Add user queue reset mask support
>
> On Fri, Oct 24, 2025 at 5:45 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
> >
> > This commit adds support for tracking and exposing the reset
> > capabilities of user mode queues across different IP blocks (GFX, Compute,
> SDMA).
> >
> > These changes allow userspace to query the reset capabilities of user
> > mode queues and ensure reset operations are only attempted when
> > supported by the hardware and driver.
> >
> > Suggested-by: Alex Deucher <alexander.deucher@amd.com>
> > Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++-
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 44
> ++++++++++++++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   | 21 +++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 13 +++++++
> >  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c     | 17 +++++++++
> >  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c     | 12 ++++++
> >  drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c     | 34 ++++++++++-------
> >  drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c     | 24 ++++++++----
> >  9 files changed, 163 insertions(+), 22 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index d0fb4eb1d7c4..48b21863065e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -1182,6 +1182,7 @@ struct amdgpu_device {
> >          * Value: struct amdgpu_usermode_queue
> >          */
> >         struct xarray userq_doorbell_xa;
> > +       u32 userq_supported_reset[AMDGPU_RING_TYPE_MAX];
>
> I don't think we need a separate userq_supported_reset array.  Just use the existing
> reset masks.  We use the same functionality in both kernel and userq cases so I
> don't see a reason to have a separate tracker.
[Zhang, Jesse(Jie)] Thanks Alex for reviewing.
I have another question regarding the user queue reset mask sysfs.
Should the user queues share the existing sysfs file with the kernel reset mask, or should we expose a separate sysfs file for the userq reset mask?
For example:
/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0/sdma_reset_mask
/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0/sdma_userq_reset_mask

Thanks
Jesse

>
> Alex
>
> >
> >         /* df */
> >         struct amdgpu_df                df;
> > @@ -1612,6 +1613,8 @@ struct dma_fence
> *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
> >                                                   struct amdgpu_ring *ring,
> >                                                   struct amdgpu_job
> > *job);  bool amdgpu_device_has_display_hardware(struct amdgpu_device
> > *adev);
> > +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
> > +                                   int ring_type);
> >  ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
> > ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 8480b72258f2..a0064c5314df 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -7649,7 +7649,8 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct
> amdgpu_ring *ring)
> >         if (!ring || !ring->adev)
> >                 return size;
> >
> > -       if (amdgpu_device_should_recover_gpu(ring->adev))
> > +       if (amdgpu_device_should_recover_gpu(ring->adev) &&
> > +           unlikely(!ring->adev->debug_disable_gpu_ring_reset))
> >                 size |= AMDGPU_RESET_TYPE_FULL;
> >
> >         if (unlikely(!ring->adev->debug_disable_soft_recovery) && @@
> > -7659,6 +7660,20 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct
> amdgpu_ring *ring)
> >         return size;
> >  }
> >
> > +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
> > +int ring_type) {
> > +       ssize_t size = 0;
> > +
> > +       if (!adev || !adev->userq_funcs[ring_type])
> > +               return size;
> > +
> > +       if (amdgpu_device_should_recover_gpu(adev) &&
> > +           unlikely(!adev->debug_disable_gpu_ring_reset))
> > +               size |= AMDGPU_RESET_TYPE_FULL;
> > +
> > +       return size;
> > +}
> > +
> >  ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
> > {
> >         ssize_t size = 0;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > index 3d24f9cd750a..5597753ec61a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > @@ -1826,6 +1826,32 @@ static ssize_t
> amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> >         return amdgpu_show_reset_mask(buf,
> > adev->gfx.compute_supported_reset);
> >  }
> >
> > +static ssize_t amdgpu_userq_get_gfx_reset_mask(struct device *dev,
> > +                                               struct device_attribute *attr,
> > +                                               char *buf) {
> > +       struct drm_device *ddev = dev_get_drvdata(dev);
> > +       struct amdgpu_device *adev = drm_to_adev(ddev);
> > +
> > +       if (!adev)
> > +               return -ENODEV;
> > +
> > +       return amdgpu_show_reset_mask(buf,
> > +adev->userq_supported_reset[AMDGPU_HW_IP_GFX]);
> > +}
> > +
> > +static ssize_t amdgpu_userq_get_compute_reset_mask(struct device *dev,
> > +                                               struct device_attribute *attr,
> > +                                               char *buf) {
> > +       struct drm_device *ddev = dev_get_drvdata(dev);
> > +       struct amdgpu_device *adev = drm_to_adev(ddev);
> > +
> > +       if (!adev)
> > +               return -ENODEV;
> > +
> > +       return amdgpu_show_reset_mask(buf,
> > +adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE]);
> > +}
> > +
> >  static DEVICE_ATTR(run_cleaner_shader, 0200,
> >                    NULL, amdgpu_gfx_set_run_cleaner_shader);
> >
> > @@ -1845,6 +1871,12 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
> > static DEVICE_ATTR(compute_reset_mask, 0444,
> >                    amdgpu_gfx_get_compute_reset_mask, NULL);
> >
> > +static DEVICE_ATTR(gfx_userq_reset_mask, 0444,
> > +                  amdgpu_userq_get_gfx_reset_mask, NULL);
> > +
> > +static DEVICE_ATTR(compute_userq_reset_mask, 0444,
> > +                  amdgpu_userq_get_compute_reset_mask, NULL);
> > +
> >  static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)  {
> >         struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1928,6
> > +1960,18 @@ static int amdgpu_gfx_sysfs_reset_mask_init(struct
> amdgpu_device *adev)
> >                         return r;
> >         }
> >
> > +       if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) {
> > +               r = device_create_file(adev->dev, &dev_attr_gfx_userq_reset_mask);
> > +               if (r)
> > +                       return r;
> > +       }
> > +
> > +       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) {
> > +               r = device_create_file(adev->dev,
> &dev_attr_compute_userq_reset_mask);
> > +               if (r)
> > +                       return r;
> > +       }
> > +
> >         return r;
> >  }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > index 8b8a04138711..2fb288b2bfc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > @@ -471,6 +471,21 @@ static ssize_t amdgpu_get_sdma_reset_mask(struct
> > device *dev,  static DEVICE_ATTR(sdma_reset_mask, 0444,
> >                    amdgpu_get_sdma_reset_mask, NULL);
> >
> > +static ssize_t amdgpu_get_sdma_userq_reset_mask(struct device *dev,
> > +                                               struct device_attribute *attr,
> > +                                               char *buf) {
> > +       struct drm_device *ddev = dev_get_drvdata(dev);
> > +       struct amdgpu_device *adev = drm_to_adev(ddev);
> > +
> > +       if (!adev)
> > +               return -ENODEV;
> > +
> > +       return amdgpu_show_reset_mask(buf,
> > +adev->userq_supported_reset[AMDGPU_HW_IP_DMA]);
> > +}
> > +static DEVICE_ATTR(sdma_userq_reset_mask, 0444,
> > +                  amdgpu_get_sdma_userq_reset_mask, NULL);
> > +
> >  int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)  {
> >         int r = 0;
> > @@ -484,6 +499,12 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct
> amdgpu_device *adev)
> >                         return r;
> >         }
> >
> > +       if (adev->userq_funcs[AMDGPU_HW_IP_DMA]) {
> > +               r = device_create_file(adev->dev,
> &dev_attr_sdma_userq_reset_mask);
> > +               if (r)
> > +                       return r;
> > +       }
> > +
> >         return r;
> >  }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > index 188de848c229..15ae72e2d679 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > @@ -47,6 +47,16 @@ u32 amdgpu_userq_get_supported_ip_mask(struct
> amdgpu_device *adev)
> >         return userq_ip_mask;
> >  }
> >
> > +bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
> > +                                         int ring_type,
> > +                                         int reset_type) {
> > +    if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
> > +        return false;
> > +
> > +    return (adev->userq_supported_reset[ring_type] & reset_type) !=
> > +0; }
> > +
> >  static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)  {
> >         if (amdgpu_device_should_recover_gpu(adev)) { @@ -94,6 +104,9
> > @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr
> *uq_mgr)
> >                 int ring_type = queue_types[i];
> >                 const struct amdgpu_userq_funcs *funcs =
> > adev->userq_funcs[ring_type];
> >
> > +               if (!amdgpu_userq_is_reset_type_supported(adev, ring_type,
> AMDGPU_RESET_TYPE_PER_QUEUE))
> > +                               continue;
> > +
> >                 if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
> >                     funcs && funcs->detect_and_reset) {
> >                         r = funcs->detect_and_reset(adev, ring_type);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > index 252517ce5d5a..82b7c365d720 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > @@ -1815,6 +1815,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> >                 amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> >         adev->gfx.compute_supported_reset =
> >
> > amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> > +       adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> > +                       amdgpu_userq_get_full_reset_mask(adev,
> AMDGPU_HW_IP_GFX);
> > +       adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> > +
> > + amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> > +
> >         switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> >         case IP_VERSION(11, 0, 0):
> >         case IP_VERSION(11, 0, 2):
> > @@ -1824,12 +1829,24 @@ static int gfx_v11_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> >                     !amdgpu_sriov_vf(adev)) {
> >                         adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> >                         adev->gfx.gfx_supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> > +                           adev->userq_funcs[AMDGPU_HW_IP_GFX]-
> >detect_and_reset)
> > +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> > +                           adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]-
> >detect_and_reset)
> > +
> > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> > + AMDGPU_RESET_TYPE_PER_QUEUE;
> >                 }
> >                 break;
> >         default:
> >                 if (!amdgpu_sriov_vf(adev)) {
> >                         adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> >                         adev->gfx.gfx_supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> > +                           adev->userq_funcs[AMDGPU_HW_IP_GFX]-
> >detect_and_reset)
> > +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> > +                           adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]-
> >detect_and_reset)
> > +
> > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> > + AMDGPU_RESET_TYPE_PER_QUEUE;
> >                 }
> >                 break;
> >         }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > index 35d5a7e99a7c..c5ac42a30789 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > @@ -1543,6 +1543,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> >                 amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> >         adev->gfx.compute_supported_reset =
> >
> > amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> > +       adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> > +               amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
> > +       adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> > +
> > + amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> > +
> >         switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> >         case IP_VERSION(12, 0, 0):
> >         case IP_VERSION(12, 0, 1):
> > @@ -1551,6 +1556,13 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> >                     !amdgpu_sriov_vf(adev)) {
> >                         adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> >                         adev->gfx.gfx_supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> > +                           adev->userq_funcs[AMDGPU_HW_IP_GFX]-
> >detect_and_reset)
> > +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> > +                           adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]-
> >detect_and_reset)
> > +
> > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> > + AMDGPU_RESET_TYPE_PER_QUEUE;
> > +
> >                 }
> >                 break;
> >         default:
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > index db6e41967f12..8850eaf8d2c4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > @@ -1349,19 +1349,6 @@ static int sdma_v6_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> >                         return r;
> >         }
> >
> > -       adev->sdma.supported_reset =
> > -               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > -       switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> > -       case IP_VERSION(6, 0, 0):
> > -       case IP_VERSION(6, 0, 2):
> > -       case IP_VERSION(6, 0, 3):
> > -               if ((adev->sdma.instance[0].fw_version >= 21) &&
> > -                   !amdgpu_sriov_vf(adev))
> > -                       adev->sdma.supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > -               break;
> > -       default:
> > -               break;
> > -       }
> >
> >         if (amdgpu_sdma_ras_sw_init(adev)) {
> >                 dev_err(adev->dev, "Failed to initialize sdma ras
> > block!\n"); @@ -1412,6 +1399,27 @@ static int sdma_v6_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> >                 break;
> >         }
> >
> > +       adev->sdma.supported_reset =
> > +               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > +       adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> > +               amdgpu_userq_get_full_reset_mask(adev,
> > + AMDGPU_HW_IP_DMA);
> > +
> > +       switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> > +       case IP_VERSION(6, 0, 0):
> > +       case IP_VERSION(6, 0, 2):
> > +       case IP_VERSION(6, 0, 3):
> > +               if ((adev->sdma.instance[0].fw_version >= 21) &&
> > +                   !amdgpu_sriov_vf(adev)) {
> > +                       adev->sdma.supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > +                       if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> > +                           adev->userq_funcs[AMDGPU_HW_IP_DMA]-
> >detect_and_reset)
> > +
> > + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |=
> > + AMDGPU_RESET_TYPE_PER_QUEUE;
> > +
> > +               }
> > +               break;
> > +       default:
> > +               break;
> > +       }
> >         r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> >         if (r)
> >                 return r;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > index 326ecc8d37d2..9de46ac8b1db 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > @@ -1335,14 +1335,6 @@ static int sdma_v7_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> >                         return r;
> >         }
> >
> > -       adev->sdma.supported_reset =
> > -               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > -       if (!amdgpu_sriov_vf(adev))
> > -               adev->sdma.supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > -
> > -       r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> > -       if (r)
> > -               return r;
> >         /* Allocate memory for SDMA IP Dump buffer */
> >         ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t),
> GFP_KERNEL);
> >         if (ptr)
> > @@ -1360,6 +1352,22 @@ static int sdma_v7_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> >                 break;
> >         }
> >
> > +       adev->sdma.supported_reset =
> > +               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > +       adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> > +               amdgpu_userq_get_full_reset_mask(adev,
> > + AMDGPU_HW_IP_DMA);
> > +
> > +       if (!amdgpu_sriov_vf(adev)) {
> > +               adev->sdma.supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > +               if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> > +                   adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
> > +                       adev->userq_supported_reset[AMDGPU_HW_IP_DMA]
> > + |= AMDGPU_RESET_TYPE_PER_QUEUE;
> > +
> > +       }
> > +       r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> > +       if (r)
> > +               return r;
> > +
> >         return r;
> >  }
> >
> > --
> > 2.49.0
> >

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: Add user queue reset mask support
  2025-10-30  2:08     ` Zhang, Jesse(Jie)
@ 2025-10-30 12:41       ` Alex Deucher
  0 siblings, 0 replies; 9+ messages in thread
From: Alex Deucher @ 2025-10-30 12:41 UTC (permalink / raw)
  To: Zhang, Jesse(Jie)
  Cc: amd-gfx@lists.freedesktop.org, Deucher, Alexander,
	Koenig, Christian

On Wed, Oct 29, 2025 at 10:08 PM Zhang, Jesse(Jie) <Jesse.Zhang@amd.com> wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> > -----Original Message-----
> > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Alex
> > Deucher
> > Sent: Thursday, October 30, 2025 2:14 AM
> > To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>
> > Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander
> > <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> > Subject: Re: [PATCH 2/3] drm/amdgpu: Add user queue reset mask support
> >
> > On Fri, Oct 24, 2025 at 5:45 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
> > >
> > > This commit adds support for tracking and exposing the reset
> > > capabilities of user mode queues across different IP blocks (GFX, Compute,
> > SDMA).
> > >
> > > These changes allow userspace to query the reset capabilities of user
> > > mode queues and ensure reset operations are only attempted when
> > > supported by the hardware and driver.
> > >
> > > Suggested-by: Alex Deucher <alexander.deucher@amd.com>
> > > Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++-
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 44
> > ++++++++++++++++++++++
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   | 21 +++++++++++
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 13 +++++++
> > >  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c     | 17 +++++++++
> > >  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c     | 12 ++++++
> > >  drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c     | 34 ++++++++++-------
> > >  drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c     | 24 ++++++++----
> > >  9 files changed, 163 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > index d0fb4eb1d7c4..48b21863065e 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > @@ -1182,6 +1182,7 @@ struct amdgpu_device {
> > >          * Value: struct amdgpu_usermode_queue
> > >          */
> > >         struct xarray userq_doorbell_xa;
> > > +       u32 userq_supported_reset[AMDGPU_RING_TYPE_MAX];
> >
> > I don't think we need a separate userq_supported_reset array.  Just use the existing
> > reset masks.  We use the same functionality in both kernel and userq cases so I
> > don't see a reason to have a separate tracker.
> [Zhang, Jesse(Jie)] Thanks Alex for reviewing.
> I have another question regarding the user queue reset mask sysfs.
> Should the user queues share the sysfs file with the kernel reset mask, or should we set a separate mask for userq_mask_reset?
> For example:
> /sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0/sdma_reset_mask
> /sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0/sdma_userq_reset_mask

I think a single one is fine unless you can think of a case where they
would be different.

Alex

>
> Thanks
> Jesse
>
> >
> > Alex
> >
> > >
> > >         /* df */
> > >         struct amdgpu_df                df;
> > > @@ -1612,6 +1613,8 @@ struct dma_fence
> > *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
> > >                                                   struct amdgpu_ring *ring,
> > >                                                   struct amdgpu_job
> > > *job);  bool amdgpu_device_has_display_hardware(struct amdgpu_device
> > > *adev);
> > > +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
> > > +                                   int ring_type);
> > >  ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
> > > ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > index 8480b72258f2..a0064c5314df 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > @@ -7649,7 +7649,8 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct
> > amdgpu_ring *ring)
> > >         if (!ring || !ring->adev)
> > >                 return size;
> > >
> > > -       if (amdgpu_device_should_recover_gpu(ring->adev))
> > > +       if (amdgpu_device_should_recover_gpu(ring->adev) &&
> > > +           unlikely(!ring->adev->debug_disable_gpu_ring_reset))
> > >                 size |= AMDGPU_RESET_TYPE_FULL;
> > >
> > >         if (unlikely(!ring->adev->debug_disable_soft_recovery) && @@
> > > -7659,6 +7660,20 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct
> > amdgpu_ring *ring)
> > >         return size;
> > >  }
> > >
> > > +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
> > > +int ring_type) {
> > > +       ssize_t size = 0;
> > > +
> > > +       if (!adev || !adev->userq_funcs[ring_type])
> > > +               return size;
> > > +
> > > +       if (amdgpu_device_should_recover_gpu(adev) &&
> > > +           unlikely(!adev->debug_disable_gpu_ring_reset))
> > > +               size |= AMDGPU_RESET_TYPE_FULL;
> > > +
> > > +       return size;
> > > +}
> > > +
> > >  ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
> > > {
> > >         ssize_t size = 0;
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > > index 3d24f9cd750a..5597753ec61a 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > > @@ -1826,6 +1826,32 @@ static ssize_t
> > amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> > >         return amdgpu_show_reset_mask(buf,
> > > adev->gfx.compute_supported_reset);
> > >  }
> > >
> > > +static ssize_t amdgpu_userq_get_gfx_reset_mask(struct device *dev,
> > > +                                               struct device_attribute *attr,
> > > +                                               char *buf) {
> > > +       struct drm_device *ddev = dev_get_drvdata(dev);
> > > +       struct amdgpu_device *adev = drm_to_adev(ddev);
> > > +
> > > +       if (!adev)
> > > +               return -ENODEV;
> > > +
> > > +       return amdgpu_show_reset_mask(buf,
> > > +adev->userq_supported_reset[AMDGPU_HW_IP_GFX]);
> > > +}
> > > +
> > > +static ssize_t amdgpu_userq_get_compute_reset_mask(struct device *dev,
> > > +                                               struct device_attribute *attr,
> > > +                                               char *buf) {
> > > +       struct drm_device *ddev = dev_get_drvdata(dev);
> > > +       struct amdgpu_device *adev = drm_to_adev(ddev);
> > > +
> > > +       if (!adev)
> > > +               return -ENODEV;
> > > +
> > > +       return amdgpu_show_reset_mask(buf,
> > > +adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE]);
> > > +}
> > > +
> > >  static DEVICE_ATTR(run_cleaner_shader, 0200,
> > >                    NULL, amdgpu_gfx_set_run_cleaner_shader);
> > >
> > > @@ -1845,6 +1871,12 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
> > > static DEVICE_ATTR(compute_reset_mask, 0444,
> > >                    amdgpu_gfx_get_compute_reset_mask, NULL);
> > >
> > > +static DEVICE_ATTR(gfx_userq_reset_mask, 0444,
> > > +                  amdgpu_userq_get_gfx_reset_mask, NULL);
> > > +
> > > +static DEVICE_ATTR(compute_userq_reset_mask, 0444,
> > > +                  amdgpu_userq_get_compute_reset_mask, NULL);
> > > +
> > >  static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)  {
> > >         struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1928,6
> > > +1960,18 @@ static int amdgpu_gfx_sysfs_reset_mask_init(struct
> > amdgpu_device *adev)
> > >                         return r;
> > >         }
> > >
> > > +       if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) {
> > > +               r = device_create_file(adev->dev, &dev_attr_gfx_userq_reset_mask);
> > > +               if (r)
> > > +                       return r;
> > > +       }
> > > +
> > > +       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) {
> > > +               r = device_create_file(adev->dev,
> > &dev_attr_compute_userq_reset_mask);
> > > +               if (r)
> > > +                       return r;
> > > +       }
> > > +
> > >         return r;
> > >  }
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > > index 8b8a04138711..2fb288b2bfc4 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > > @@ -471,6 +471,21 @@ static ssize_t amdgpu_get_sdma_reset_mask(struct
> > > device *dev,  static DEVICE_ATTR(sdma_reset_mask, 0444,
> > >                    amdgpu_get_sdma_reset_mask, NULL);
> > >
> > > +static ssize_t amdgpu_get_sdma_userq_reset_mask(struct device *dev,
> > > +                                               struct device_attribute *attr,
> > > +                                               char *buf) {
> > > +       struct drm_device *ddev = dev_get_drvdata(dev);
> > > +       struct amdgpu_device *adev = drm_to_adev(ddev);
> > > +
> > > +       if (!adev)
> > > +               return -ENODEV;
> > > +
> > > +       return amdgpu_show_reset_mask(buf,
> > > +adev->userq_supported_reset[AMDGPU_HW_IP_DMA]);
> > > +}
> > > +static DEVICE_ATTR(sdma_userq_reset_mask, 0444,
> > > +                  amdgpu_get_sdma_userq_reset_mask, NULL);
> > > +
> > >  int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)  {
> > >         int r = 0;
> > > @@ -484,6 +499,12 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct
> > amdgpu_device *adev)
> > >                         return r;
> > >         }
> > >
> > > +       if (adev->userq_funcs[AMDGPU_HW_IP_DMA]) {
> > > +               r = device_create_file(adev->dev,
> > &dev_attr_sdma_userq_reset_mask);
> > > +               if (r)
> > > +                       return r;
> > > +       }
> > > +
> > >         return r;
> > >  }
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > > index 188de848c229..15ae72e2d679 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > > @@ -47,6 +47,16 @@ u32 amdgpu_userq_get_supported_ip_mask(struct
> > amdgpu_device *adev)
> > >         return userq_ip_mask;
> > >  }
> > >
> > > +bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
> > > +                                         int ring_type,
> > > +                                         int reset_type) {
> > > +    if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
> > > +        return false;
> > > +
> > > +    return (adev->userq_supported_reset[ring_type] & reset_type) !=
> > > +0; }
> > > +
> > >  static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)  {
> > >         if (amdgpu_device_should_recover_gpu(adev)) { @@ -94,6 +104,9
> > > @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr
> > *uq_mgr)
> > >                 int ring_type = queue_types[i];
> > >                 const struct amdgpu_userq_funcs *funcs =
> > > adev->userq_funcs[ring_type];
> > >
> > > +               if (!amdgpu_userq_is_reset_type_supported(adev, ring_type,
> > AMDGPU_RESET_TYPE_PER_QUEUE))
> > > +                               continue;
> > > +
> > >                 if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
> > >                     funcs && funcs->detect_and_reset) {
> > >                         r = funcs->detect_and_reset(adev, ring_type);
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > > index 252517ce5d5a..82b7c365d720 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > > @@ -1815,6 +1815,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block
> > *ip_block)
> > >                 amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> > >         adev->gfx.compute_supported_reset =
> > >
> > > amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> > > +       adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> > > +                       amdgpu_userq_get_full_reset_mask(adev,
> > AMDGPU_HW_IP_GFX);
> > > +       adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> > > +
> > > + amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> > > +
> > >         switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> > >         case IP_VERSION(11, 0, 0):
> > >         case IP_VERSION(11, 0, 2):
> > > @@ -1824,12 +1829,24 @@ static int gfx_v11_0_sw_init(struct
> > amdgpu_ip_block *ip_block)
> > >                     !amdgpu_sriov_vf(adev)) {
> > >                         adev->gfx.compute_supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > >                         adev->gfx.gfx_supported_reset |=
> > > AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> > > +                           adev->userq_funcs[AMDGPU_HW_IP_GFX]-
> > >detect_and_reset)
> > > +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> > > +                           adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]-
> > >detect_and_reset)
> > > +
> > > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> > > + AMDGPU_RESET_TYPE_PER_QUEUE;
> > >                 }
> > >                 break;
> > >         default:
> > >                 if (!amdgpu_sriov_vf(adev)) {
> > >                         adev->gfx.compute_supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > >                         adev->gfx.gfx_supported_reset |=
> > > AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> > > +                           adev->userq_funcs[AMDGPU_HW_IP_GFX]-
> > >detect_and_reset)
> > > +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> > > +                           adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]-
> > >detect_and_reset)
> > > +
> > > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> > > + AMDGPU_RESET_TYPE_PER_QUEUE;
> > >                 }
> > >                 break;
> > >         }
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > > index 35d5a7e99a7c..c5ac42a30789 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > > @@ -1543,6 +1543,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> > *ip_block)
> > >                 amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> > >         adev->gfx.compute_supported_reset =
> > >
> > > amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> > > +       adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> > > +               amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
> > > +       adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> > > +
> > > + amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> > > +
> > >         switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> > >         case IP_VERSION(12, 0, 0):
> > >         case IP_VERSION(12, 0, 1):
> > > @@ -1551,6 +1556,13 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> > *ip_block)
> > >                     !amdgpu_sriov_vf(adev)) {
> > >                         adev->gfx.compute_supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > >                         adev->gfx.gfx_supported_reset |=
> > > AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> > > +                           adev->userq_funcs[AMDGPU_HW_IP_GFX]-
> > >detect_and_reset)
> > > +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> > > +                           adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]-
> > >detect_and_reset)
> > > +
> > > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> > > + AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +
> > >                 }
> > >                 break;
> > >         default:
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > > b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > > index db6e41967f12..8850eaf8d2c4 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > > @@ -1349,19 +1349,6 @@ static int sdma_v6_0_sw_init(struct
> > amdgpu_ip_block *ip_block)
> > >                         return r;
> > >         }
> > >
> > > -       adev->sdma.supported_reset =
> > > -               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > > -       switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> > > -       case IP_VERSION(6, 0, 0):
> > > -       case IP_VERSION(6, 0, 2):
> > > -       case IP_VERSION(6, 0, 3):
> > > -               if ((adev->sdma.instance[0].fw_version >= 21) &&
> > > -                   !amdgpu_sriov_vf(adev))
> > > -                       adev->sdma.supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > > -               break;
> > > -       default:
> > > -               break;
> > > -       }
> > >
> > >         if (amdgpu_sdma_ras_sw_init(adev)) {
> > >                 dev_err(adev->dev, "Failed to initialize sdma ras
> > > block!\n"); @@ -1412,6 +1399,27 @@ static int sdma_v6_0_sw_init(struct
> > amdgpu_ip_block *ip_block)
> > >                 break;
> > >         }
> > >
> > > +       adev->sdma.supported_reset =
> > > +               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > > +       adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> > > +               amdgpu_userq_get_full_reset_mask(adev,
> > > + AMDGPU_HW_IP_DMA);
> > > +
> > > +       switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> > > +       case IP_VERSION(6, 0, 0):
> > > +       case IP_VERSION(6, 0, 2):
> > > +       case IP_VERSION(6, 0, 3):
> > > +               if ((adev->sdma.instance[0].fw_version >= 21) &&
> > > +                   !amdgpu_sriov_vf(adev)) {
> > > +                       adev->sdma.supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +                       if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> > > +                           adev->userq_funcs[AMDGPU_HW_IP_DMA]-
> > >detect_and_reset)
> > > +
> > > + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |=
> > > + AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +
> > > +               }
> > > +               break;
> > > +       default:
> > > +               break;
> > > +       }
> > >         r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> > >         if (r)
> > >                 return r;
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > > b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > > index 326ecc8d37d2..9de46ac8b1db 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > > @@ -1335,14 +1335,6 @@ static int sdma_v7_0_sw_init(struct
> > amdgpu_ip_block *ip_block)
> > >                         return r;
> > >         }
> > >
> > > -       adev->sdma.supported_reset =
> > > -               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > > -       if (!amdgpu_sriov_vf(adev))
> > > -               adev->sdma.supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > > -
> > > -       r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> > > -       if (r)
> > > -               return r;
> > >         /* Allocate memory for SDMA IP Dump buffer */
> > >         ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t),
> > GFP_KERNEL);
> > >         if (ptr)
> > > @@ -1360,6 +1352,22 @@ static int sdma_v7_0_sw_init(struct
> > amdgpu_ip_block *ip_block)
> > >                 break;
> > >         }
> > >
> > > +       adev->sdma.supported_reset =
> > > +               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > > +       adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> > > +               amdgpu_userq_get_full_reset_mask(adev,
> > > + AMDGPU_HW_IP_DMA);
> > > +
> > > +       if (!amdgpu_sriov_vf(adev)) {
> > > +               adev->sdma.supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +               if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> > > +                   adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
> > > +                       adev->userq_supported_reset[AMDGPU_HW_IP_DMA]
> > > + |= AMDGPU_RESET_TYPE_PER_QUEUE;
> > > +
> > > +       }
> > > +       r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> > > +       if (r)
> > > +               return r;
> > > +
> > >         return r;
> > >  }
> > >
> > > --
> > > 2.49.0
> > >

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 3/3] drm/amdgpu: use irq-safe lock in amdgpu_userq_fence_driver_process
  2025-10-24  9:43 [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality Jesse.Zhang
  2025-10-24  9:43 ` [PATCH 2/3] drm/amdgpu: Add user queue reset mask support Jesse.Zhang
@ 2025-10-24  9:43 ` Jesse.Zhang
  2025-10-28  4:58   ` Zhang, Jesse(Jie)
  2025-10-29 18:09   ` Alex Deucher
  2025-10-29 18:09 ` [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality Alex Deucher
  2 siblings, 2 replies; 9+ messages in thread
From: Jesse.Zhang @ 2025-10-24  9:43 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang

The amdgpu_userq_fence_driver_process() function can be called from
both interrupt context (IRQ handlers like gfx_v11_0_eop_irq) and
process context (workqueues like eviction suspend worker). Taking the
lock with plain spin_lock() in process context leaves interrupts
enabled, so an interrupt handler contending for the same lock triggers
lockdep warnings and can deadlock.

Replace the regular spin_lock()/spin_unlock() with their interrupt-
safe variants spin_lock_irqsave()/spin_unlock_irqrestore() to
ensure proper locking semantics in all execution contexts.

This ensures:
- Interrupts are properly disabled when locking in interrupt context
- No lockdep warnings due to mixed context usage
- Safe execution across all code paths that process user queue fences

Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 2aeeaa954882..69908b90d255 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -151,15 +151,16 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
 {
 	struct amdgpu_userq_fence *userq_fence, *tmp;
 	struct dma_fence *fence;
+	unsigned long flags;
 	u64 rptr;
 	int i;
 
 	if (!fence_drv)
 		return;
 
+	spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
 	rptr = amdgpu_userq_fence_read(fence_drv);
 
-	spin_lock(&fence_drv->fence_list_lock);
 	list_for_each_entry_safe(userq_fence, tmp, &fence_drv->fences, link) {
 		fence = &userq_fence->base;
 
@@ -174,7 +175,7 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
 		list_del(&userq_fence->link);
 		dma_fence_put(fence);
 	}
-	spin_unlock(&fence_drv->fence_list_lock);
+	spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
 }
 
 void amdgpu_userq_fence_driver_destroy(struct kref *ref)
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: use irq-safe lock in amdgpu_userq_fence_driver_process
  2025-10-24  9:43 ` [PATCH 3/3] drm/amdgpu: use irq-safe lock in amdgpu_userq_fence_driver_process Jesse.Zhang
@ 2025-10-28  4:58   ` Zhang, Jesse(Jie)
  2025-10-29 18:09   ` Alex Deucher
  1 sibling, 0 replies; 9+ messages in thread
From: Zhang, Jesse(Jie) @ 2025-10-28  4:58 UTC (permalink / raw)
  To: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org
  Cc: Deucher, Alexander, Koenig, Christian

[AMD Official Use Only - AMD Internal Distribution Only]

Ping this series.


> -----Original Message-----
> From: Jesse.Zhang <Jesse.Zhang@amd.com>
> Sent: Friday, October 24, 2025 5:44 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian
> <Christian.Koenig@amd.com>; Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>
> Subject: [PATCH 3/3] drm/amdgpu: use irq-safe lock in
> amdgpu_userq_fence_driver_process
>
> The amdgpu_userq_fence_driver_process() function can be called from both
> interrupt context (IRQ handlers like gfx_v11_0_eop_irq) and process context
> (workqueues like eviction suspend worker). Using regular spin_lock() in interrupt
> context triggers lockdep warnings and could lead to potential deadlocks.
>
> Replace the regular spin_lock()/spin_unlock() with their interrupt-safe variants
> spin_lock_irqsave()/spin_unlock_irqrestore() to ensure proper locking semantics in
> all execution contexts.
>
> This ensures:
> - Interrupts are properly disabled when locking in interrupt context
> - No lockdep warnings due to mixed context usage
> - Safe execution across all code paths that process user queue fences
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 2aeeaa954882..69908b90d255 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -151,15 +151,16 @@ void amdgpu_userq_fence_driver_process(struct
> amdgpu_userq_fence_driver *fence_d  {
>       struct amdgpu_userq_fence *userq_fence, *tmp;
>       struct dma_fence *fence;
> +     unsigned long flags;
>       u64 rptr;
>       int i;
>
>       if (!fence_drv)
>               return;
>
> +     spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
>       rptr = amdgpu_userq_fence_read(fence_drv);
>
> -     spin_lock(&fence_drv->fence_list_lock);
>       list_for_each_entry_safe(userq_fence, tmp, &fence_drv->fences, link) {
>               fence = &userq_fence->base;
>
> @@ -174,7 +175,7 @@ void amdgpu_userq_fence_driver_process(struct
> amdgpu_userq_fence_driver *fence_d
>               list_del(&userq_fence->link);
>               dma_fence_put(fence);
>       }
> -     spin_unlock(&fence_drv->fence_list_lock);
> +     spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
>  }
>
>  void amdgpu_userq_fence_driver_destroy(struct kref *ref)
> --
> 2.49.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: use irq-safe lock in amdgpu_userq_fence_driver_process
  2025-10-24  9:43 ` [PATCH 3/3] drm/amdgpu: use irq-safe lock in amdgpu_userq_fence_driver_process Jesse.Zhang
  2025-10-28  4:58   ` Zhang, Jesse(Jie)
@ 2025-10-29 18:09   ` Alex Deucher
  1 sibling, 0 replies; 9+ messages in thread
From: Alex Deucher @ 2025-10-29 18:09 UTC (permalink / raw)
  To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig

On Fri, Oct 24, 2025 at 5:45 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> The amdgpu_userq_fence_driver_process() function can be called from
> both interrupt context (IRQ handlers like gfx_v11_0_eop_irq) and
> process context (workqueues like eviction suspend worker). Using
> regular spin_lock() in interrupt context triggers lockdep warnings
> and could lead to potential deadlocks.
>
> Replace the regular spin_lock()/spin_unlock() with their interrupt-
> safe variants spin_lock_irqsave()/spin_unlock_irqrestore() to
> ensure proper locking semantics in all execution contexts.
>
> This ensures:
> - Interrupts are properly disabled when locking in interrupt context
> - No lockdep warnings due to mixed context usage
> - Safe execution across all code paths that process user queue fences
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 2aeeaa954882..69908b90d255 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -151,15 +151,16 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
>  {
>         struct amdgpu_userq_fence *userq_fence, *tmp;
>         struct dma_fence *fence;
> +       unsigned long flags;
>         u64 rptr;
>         int i;
>
>         if (!fence_drv)
>                 return;
>
> +       spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
>         rptr = amdgpu_userq_fence_read(fence_drv);
>
> -       spin_lock(&fence_drv->fence_list_lock);
>         list_for_each_entry_safe(userq_fence, tmp, &fence_drv->fences, link) {
>                 fence = &userq_fence->base;
>
> @@ -174,7 +175,7 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
>                 list_del(&userq_fence->link);
>                 dma_fence_put(fence);
>         }
> -       spin_unlock(&fence_drv->fence_list_lock);
> +       spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
>  }
>
>  void amdgpu_userq_fence_driver_destroy(struct kref *ref)
> --
> 2.49.0
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality
  2025-10-24  9:43 [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality Jesse.Zhang
  2025-10-24  9:43 ` [PATCH 2/3] drm/amdgpu: Add user queue reset mask support Jesse.Zhang
  2025-10-24  9:43 ` [PATCH 3/3] drm/amdgpu: use irq-safe lock in amdgpu_userq_fence_driver_process Jesse.Zhang
@ 2025-10-29 18:09 ` Alex Deucher
  2 siblings, 0 replies; 9+ messages in thread
From: Alex Deucher @ 2025-10-29 18:09 UTC (permalink / raw)
  To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig

On Fri, Oct 24, 2025 at 6:01 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> From: Alex Deucher <alexander.deucher@amd.com>
>
> This patch adds robust reset handling for user queues (userq) to improve
> recovery from queue failures. The key components include:
>
> 1. Queue detection and reset logic:
>    - amdgpu_userq_detect_and_reset_queues() identifies failed queues
>    - Per-IP detect_and_reset callbacks for targeted recovery
>    - Falls back to full GPU reset when needed
>
> 2. Reset infrastructure:
>    - Adds userq_reset_work workqueue for async reset handling
>    - Implements pre/post reset handlers for queue state management
>    - Integrates with existing GPU reset framework
>
> 3. Error handling improvements:
>    - Enhanced state tracking with HUNG state
>    - Automatic reset triggering on critical failures
>    - VRAM loss handling during recovery
>
> 4. Integration points:
>    - Added to device init/reset paths
>    - Called during queue destroy, suspend, and isolation events
>    - Handles both individual queue and full GPU resets
>
> The reset functionality works with both gfx/compute and sdma queues,
> providing better resilience against queue failures while minimizing
> disruption to unaffected queues.
>
> v2: add detection and reset calls when preemption/unmap fails.
>     add a per device userq counter for each user queue type.(Alex)
> v3: make sure we hold the adev->userq_mutex when we call amdgpu_userq_detect_and_reset_queues. (Alex)
>    warn if the adev->userq_mutex is not held.
> v4: make sure we have all of the uqm->userq_mutex held.
>    warn if the uqm->userq_mutex is not held.
>
> v5: Use array for user queue type counters.(Alex)
>     all of the uqm->userq_mutex need to be held when calling detect and reset.  (Alex)
>
> v6: fix lockdep warning in amdgpu_userq_fence_driver_process
>
> v7: add the queue types in an array and use a loop in amdgpu_userq_detect_and_reset_queues (Lijo)
> v8: remove atomic_set(&userq_mgr->userq_count[i], 0).
>    it should already be 0 since we kzalloc the structure (Alex)
>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   8 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 176 +++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |   5 +
>  5 files changed, 179 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 50079209c472..d0fb4eb1d7c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1316,6 +1316,7 @@ struct amdgpu_device {
>         bool                            apu_prefer_gtt;
>
>         bool                            userq_halt_for_enforce_isolation;
> +       struct work_struct              userq_reset_work;
>         struct amdgpu_uid *uid_info;
>
>         /* KFD
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b8d91247f51a..8480b72258f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4608,6 +4608,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         }
>
>         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +       INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>
>         adev->gfx.gfx_off_req_count = 1;
>         adev->gfx.gfx_off_residency = 0;
> @@ -5990,6 +5991,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>                                 if (r)
>                                         goto out;
>
> +                               r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> +                               if (r)
> +                                       goto out;
> +
>                                 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
>
>                                 /*
> @@ -6212,6 +6217,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>         if (!amdgpu_sriov_vf(adev))
>                 cancel_work(&adev->reset_work);
>  #endif
> +       cancel_work(&adev->userq_reset_work);
>
>         if (adev->kfd.dev)
>                 cancel_work(&adev->kfd.reset_work);
> @@ -6332,6 +6338,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
>                     amdgpu_device_ip_need_full_reset(tmp_adev))
>                         amdgpu_ras_suspend(tmp_adev);
>
> +               amdgpu_userq_pre_reset(tmp_adev);
> +
>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 87b962df5460..7a27c6c4bb44 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -83,6 +83,7 @@ enum amdgpu_ring_type {
>         AMDGPU_RING_TYPE_MES,
>         AMDGPU_RING_TYPE_UMSCH_MM,
>         AMDGPU_RING_TYPE_CPER,
> +       AMDGPU_RING_TYPE_MAX,
>  };
>
>  enum amdgpu_ib_pool_type {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index c50b162e80a7..188de848c229 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -25,8 +25,10 @@
>  #include <drm/drm_auth.h>
>  #include <drm/drm_exec.h>
>  #include <linux/pm_runtime.h>
> +#include <drm/drm_drv.h>
>
>  #include "amdgpu.h"
> +#include "amdgpu_reset.h"
>  #include "amdgpu_vm.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_hmm.h"
> @@ -45,6 +47,69 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
>         return userq_ip_mask;
>  }
>
> +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> +{
> +       if (amdgpu_device_should_recover_gpu(adev)) {
> +               amdgpu_reset_domain_schedule(adev->reset_domain,
> +                                            &adev->userq_reset_work);
> +               /* Wait for the reset job to complete */
> +               flush_work(&adev->userq_reset_work);
> +       }
> +}
> +
> +static int
> +amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> +{
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       const int queue_types[] = {
> +               AMDGPU_RING_TYPE_COMPUTE,
> +               AMDGPU_RING_TYPE_GFX,
> +               AMDGPU_RING_TYPE_SDMA
> +       };
> +       const int num_queue_types = ARRAY_SIZE(queue_types);
> +       bool gpu_reset = false;
> +       int r = 0;
> +       int i;
> +
> +       /* Warning if current process mutex is not held */
> +       WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
> +
> +       if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> +               dev_err(adev->dev, "userq reset disabled by debug mask\n");
> +               return 0;
> +       }
> +
> +       /*
> +        * If GPU recovery feature is disabled system-wide,
> +        * skip all reset detection logic
> +        */
> +       if (!amdgpu_gpu_recovery)
> +               return 0;
> +
> +       /*
> +        * Iterate through all queue types to detect and reset problematic queues
> +        * Process each queue type in the defined order
> +        */
> +       for (i = 0; i < num_queue_types; i++) {
> +               int ring_type = queue_types[i];
> +               const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
> +
> +               if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
> +                   funcs && funcs->detect_and_reset) {
> +                       r = funcs->detect_and_reset(adev, ring_type);
> +                       if (r) {
> +                               gpu_reset = true;
> +                               break;
> +                       }
> +               }
> +       }
> +
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
> +       return r;
> +}
> +
>  static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
>                                            struct amdgpu_bo_va_mapping *va_map, u64 addr)
>  {
> @@ -175,17 +240,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool found_hung_queue = false;
>         int r = 0;
>
>         if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
>                 r = userq_funcs->preempt(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       found_hung_queue = true;
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
>                 }
>         }
>
> +       if (found_hung_queue)
> +               amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +
>         return r;
>  }
>
> @@ -217,16 +287,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool found_hung_queue = false;
>         int r = 0;
>
>         if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
>                 (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
>                 r = userq_funcs->unmap(uq_mgr, queue);
> -               if (r)
> +               if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> -               else
> +                       found_hung_queue = true;
> +               } else {
>                         queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> +               }
>         }
> +
> +       if (found_hung_queue)
> +               amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +
>         return r;
>  }
>
> @@ -243,10 +320,12 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
>                 r = userq_funcs->map(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       amdgpu_userq_detect_and_reset_queues(uq_mgr);
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_MAPPED;
>                 }
>         }
> +
>         return r;
>  }
>
> @@ -474,10 +553,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
>                 amdgpu_bo_unreserve(queue->db_obj.obj);
>         }
>         amdgpu_bo_unref(&queue->db_obj.obj);
> -
> +       atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
>  #if defined(CONFIG_DEBUG_FS)
>         debugfs_remove_recursive(queue->debugfs_queue);
>  #endif
> +       amdgpu_userq_detect_and_reset_queues(uq_mgr);
>         r = amdgpu_userq_unmap_helper(uq_mgr, queue);
>         /*TODO: It requires a reset for userq hw unmap error*/
>         if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
> @@ -699,6 +779,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>         kfree(queue_name);
>
>         args->out.queue_id = qid;
> +       atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
>
>  unlock:
>         mutex_unlock(&uq_mgr->userq_mutex);
> @@ -1043,6 +1124,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>         unsigned long queue_id;
>         int ret = 0, r;
>
> +       amdgpu_userq_detect_and_reset_queues(uq_mgr);
>         /* Try to unmap all the queues in this process ctx */
>         xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>                 r = amdgpu_userq_preempt_helper(uq_mgr, queue);
> @@ -1055,6 +1137,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>         return ret;
>  }
>
> +void amdgpu_userq_reset_work(struct work_struct *work)
> +{
> +       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> +                                                 userq_reset_work);
> +       struct amdgpu_reset_context reset_context;
> +
> +       memset(&reset_context, 0, sizeof(reset_context));
> +
> +       reset_context.method = AMD_RESET_METHOD_NONE;
> +       reset_context.reset_req_dev = adev;
> +       reset_context.src = AMDGPU_RESET_SRC_USERQ;
> +       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +       /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> +       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +}
> +
>  static int
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>  {
> @@ -1082,22 +1181,19 @@ void
>  amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
>                    struct amdgpu_eviction_fence *ev_fence)
>  {
> -       int ret;
>         struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>         struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       int ret;
>
>         /* Wait for any pending userqueue fence work to finish */
>         ret = amdgpu_userq_wait_for_signal(uq_mgr);
> -       if (ret) {
> -               drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
> -               return;
> -       }
> +       if (ret)
> +               dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
>
>         ret = amdgpu_userq_evict_all(uq_mgr);
> -       if (ret) {
> -               drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
> -               return;
> -       }
> +       if (ret)
> +               dev_err(adev->dev, "Failed to evict userqueue\n");
>
>         /* Signal current eviction fence */
>         amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
> @@ -1131,6 +1227,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
>         cancel_delayed_work_sync(&userq_mgr->resume_work);
>
>         mutex_lock(&userq_mgr->userq_mutex);
> +       amdgpu_userq_detect_and_reset_queues(userq_mgr);
>         xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
>                 amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
>                 amdgpu_userq_unmap_helper(userq_mgr, queue);
> @@ -1157,6 +1254,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
>                 uqm = queue->userq_mgr;
>                 cancel_delayed_work_sync(&uqm->resume_work);
>                 guard(mutex)(&uqm->userq_mutex);
> +               amdgpu_userq_detect_and_reset_queues(uqm);
>                 if (adev->in_s0ix)
>                         r = amdgpu_userq_preempt_helper(uqm, queue);
>                 else
> @@ -1215,6 +1313,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                 if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
>                      (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
>                     (queue->xcp_id == idx)) {
> +                       amdgpu_userq_detect_and_reset_queues(uqm);
>                         r = amdgpu_userq_preempt_helper(uqm, queue);
>                         if (r)
>                                 ret = r;
> @@ -1287,3 +1386,56 @@ int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
>
>         return 0;
>  }
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
> +{
> +       const struct amdgpu_userq_funcs *userq_funcs;
> +       struct amdgpu_usermode_queue *queue;
> +       struct amdgpu_userq_mgr *uqm;
> +       unsigned long queue_id;
> +
> +       xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +               uqm = queue->userq_mgr;
> +               cancel_delayed_work_sync(&uqm->resume_work);
> +               if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> +                       amdgpu_userq_wait_for_last_fence(uqm, queue);
> +                       userq_funcs = adev->userq_funcs[queue->queue_type];
> +                       userq_funcs->unmap(uqm, queue);
> +                       /* just mark all queues as hung at this point.
> +                        * if unmap succeeds, we could map again
> +                        * in amdgpu_userq_post_reset() if vram is not lost
> +                        */
> +                       queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       amdgpu_userq_fence_driver_force_completion(queue);
> +               }
> +       }
> +}
> +
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> +{
> +       /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
> +        * at this point, we should be able to map it again
> +        * and continue if vram is not lost.
> +        */
> +       struct amdgpu_userq_mgr *uqm;
> +       struct amdgpu_usermode_queue *queue;
> +       const struct amdgpu_userq_funcs *userq_funcs;
> +       unsigned long queue_id;
> +       int r = 0;
> +
> +       xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +               uqm = queue->userq_mgr;
> +               if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
> +                       userq_funcs = adev->userq_funcs[queue->queue_type];
> +                       /* Re-map queue */
> +                       r = userq_funcs->map(uqm, queue);
> +                       if (r) {
> +                               dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id);
> +                               continue;
> +                       }
> +                       queue->state = AMDGPU_USERQ_STATE_MAPPED;
> +               }
> +       }
> +
> +       return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 09da0617bfa2..c37444427a14 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -106,6 +106,7 @@ struct amdgpu_userq_mgr {
>         struct amdgpu_device            *adev;
>         struct delayed_work             resume_work;
>         struct drm_file                 *file;
> +       atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
>  };
>
>  struct amdgpu_db_info {
> @@ -148,6 +149,10 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                                   u32 idx);
>  int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                                    u32 idx);
> +void amdgpu_userq_reset_work(struct work_struct *work);
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +
>  int amdgpu_userq_input_va_validate(struct amdgpu_usermode_queue *queue,
>                                    u64 addr, u64 expected_size);
>  int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
> --
> 2.49.0
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2025-10-30 12:41 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-24  9:43 [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality Jesse.Zhang
2025-10-24  9:43 ` [PATCH 2/3] drm/amdgpu: Add user queue reset mask support Jesse.Zhang
2025-10-29 18:13   ` Alex Deucher
2025-10-30  2:08     ` Zhang, Jesse(Jie)
2025-10-30 12:41       ` Alex Deucher
2025-10-24  9:43 ` [PATCH 3/3] drm/amdgpu: use irq-safe lock in amdgpu_userq_fence_driver_process Jesse.Zhang
2025-10-28  4:58   ` Zhang, Jesse(Jie)
2025-10-29 18:09   ` Alex Deucher
2025-10-29 18:09 ` [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality Alex Deucher

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox