AMD-GFX Archive on lore.kernel.org
* [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray
@ 2025-10-15  1:08 Jesse.Zhang
  2025-10-15  1:08 ` [PATCH v8 2/2] drm/amdgpu: Implement user queue reset functionality Jesse.Zhang
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Jesse.Zhang @ 2025-10-15  1:08 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang

This commit refactors the AMDGPU userqueue management subsystem to replace
the IDR (ID allocator) with an XArray for improved performance, scalability,
and maintainability. The change addresses several issues with the previous
IDR implementation and provides better locking semantics.

Key changes:

1. **Global XArray Introduction**:
   - Added `userq_doorbell_xa` to `struct amdgpu_device` for global queue tracking
   - Uses doorbell_index as key for efficient global lookup
   - Replaces the previous `userq_mgr_list` linked list approach

2. **Per-process XArray Conversion**:
   - Replaced `userq_idr` with `userq_mgr_xa` in `struct amdgpu_userq_mgr`
   - Maintains per-process queue tracking with queue_id as key
   - Uses XA_FLAGS_ALLOC for automatic ID allocation

3. **Locking Improvements**:
   - Removed global `userq_mutex` from `struct amdgpu_device`
   - Replaced with fine-grained XArray locking using XArray's internal spinlocks

4. **Runtime Idle Check Optimization**:
   - Updated `amdgpu_runtime_idle_check_userq()` to use `xa_empty()`

5. **Queue Management Functions**:
   - Converted all IDR operations to their XArray equivalents (see the sketch after this list):
     - `idr_alloc()` → `xa_alloc()`
     - `idr_find()` → `xa_load()`
     - `idr_remove()` → `xa_erase()`
     - `idr_for_each()` → `xa_for_each()`
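
For illustration, a minimal before/after sketch of the conversion pattern
(simplified, not the exact driver code; note that idr_alloc()'s end bound
is exclusive while XA_LIMIT()'s max is inclusive):

	/* old: IDR */
	struct idr idr;
	idr_init_base(&idr, 1);
	id = idr_alloc(&idr, q, 1, AMDGPU_MAX_USERQ_COUNT, GFP_KERNEL); /* id or -errno */
	q = idr_find(&idr, id);
	idr_remove(&idr, id);

	/* new: XArray; XA_FLAGS_ALLOC enables xa_alloc() */
	struct xarray xa;
	u32 id;
	xa_init_flags(&xa, XA_FLAGS_ALLOC);
	r = xa_alloc(&xa, &id, q, XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL); /* 0 or -errno */
	q = xa_load(&xa, id);
	xa_erase(&xa, id);	/* xa_erase_irq() when IRQ handlers share the array */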

Benefits:
- **Performance**: XArray provides better scalability for large numbers of queues
- **Memory Efficiency**: Reduced memory overhead compared to IDR
- **Thread Safety**: Improved locking semantics with XArray's internal spinlocks

v2: rename userq_global_xa/userq_xa to userq_doorbell_xa/userq_mgr_xa.
    Drop the explicit xa_lock and rely on the XArray's internal lock.

v3: Set queue->userq_mgr = uq_mgr in amdgpu_userq_create()
v4: use xa_store_irq (Christian)
    hold the read side of the reset lock while creating/destroying queues and the manager data structure. (Christian)
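
A condensed sketch of the v4 rule (names from the patch; error handling
simplified): queue create/destroy takes the read side of the reset domain
semaphore so it cannot race with a mode-1 reset, which holds the write side.

	down_read(&adev->reset_domain->sem);
	r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue,
				GFP_KERNEL));
	if (!r)
		r = xa_alloc(&uq_mgr->userq_mgr_xa, &qid, queue,
			     XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
	up_read(&adev->reset_domain->sem);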

Suggested-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   8 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |  16 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c     | 151 +++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h     |   8 +-
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |   4 +-
 drivers/gpu/drm/amd/amdgpu/mes_userqueue.c    |  22 ++-
 7 files changed, 101 insertions(+), 111 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d9bd1d71552e..3d71da46b471 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1174,6 +1174,12 @@ struct amdgpu_device {
 	 * queue fence.
 	 */
 	struct xarray			userq_xa;
+	/**
+	 * @userq_doorbell_xa: Global user queue map (doorbell index → queue)
+	 * Key: doorbell_index (unique global identifier for the queue)
+	 * Value: struct amdgpu_usermode_queue
+	 */
+	struct xarray userq_doorbell_xa;
 
 	/* df */
 	struct amdgpu_df                df;
@@ -1307,8 +1313,6 @@ struct amdgpu_device {
 	 */
 	bool                            apu_prefer_gtt;
 
-	struct list_head		userq_mgr_list;
-	struct mutex                    userq_mutex;
 	bool                            userq_halt_for_enforce_isolation;
 	struct amdgpu_uid *uid_info;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0d5585bc3b04..5d9fd2cabc58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4533,7 +4533,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->gfx.userq_sch_mutex);
 	mutex_init(&adev->gfx.workload_profile_mutex);
 	mutex_init(&adev->vcn.workload_profile_mutex);
-	mutex_init(&adev->userq_mutex);
 
 	amdgpu_device_init_apu_flags(adev);
 
@@ -4561,7 +4560,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
 	INIT_LIST_HEAD(&adev->pm.od_kobj_list);
 
-	INIT_LIST_HEAD(&adev->userq_mgr_list);
+	xa_init(&adev->userq_doorbell_xa);
 
 	INIT_DELAYED_WORK(&adev->delayed_init_work,
 			  amdgpu_device_delayed_init_work_handler);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index bebf2ebc4f34..e8909abcf9c0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2772,22 +2772,8 @@ static int amdgpu_runtime_idle_check_userq(struct device *dev)
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct drm_device *drm_dev = pci_get_drvdata(pdev);
 	struct amdgpu_device *adev = drm_to_adev(drm_dev);
-	struct amdgpu_usermode_queue *queue;
-	struct amdgpu_userq_mgr *uqm, *tmp;
-	int queue_id;
-	int ret = 0;
-
-	mutex_lock(&adev->userq_mutex);
-	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
-		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
-			ret = -EBUSY;
-			goto done;
-		}
-	}
-done:
-	mutex_unlock(&adev->userq_mutex);
 
-	return ret;
+	return xa_empty(&adev->userq_doorbell_xa) ? 0 : -EBUSY;
 }
 
 static int amdgpu_pmops_runtime_suspend(struct device *dev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 153d64bc3dcd..1e3a1013eb71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -30,6 +30,7 @@
 #include "amdgpu_vm.h"
 #include "amdgpu_userq.h"
 #include "amdgpu_userq_fence.h"
+#include "amdgpu_reset.h"
 
 u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
 {
@@ -277,19 +278,27 @@ amdgpu_userq_cleanup(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type];
 
+	/* Wait for mode-1 reset to complete */
+	down_read(&adev->reset_domain->sem);
+
 	/* Drop the userq reference. */
 	amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
 	uq_funcs->mqd_destroy(uq_mgr, queue);
 	amdgpu_userq_fence_driver_free(queue);
-	idr_remove(&uq_mgr->userq_idr, queue_id);
+	/* Use interrupt-safe locking since IRQ handlers may access these XArrays */
+	xa_erase_irq(&uq_mgr->userq_mgr_xa, (unsigned long)queue_id);
+	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
+	queue->userq_mgr = NULL;
 	list_del(&queue->userq_va_list);
 	kfree(queue);
+
+	up_read(&adev->reset_domain->sem);
 }
 
 static struct amdgpu_usermode_queue *
 amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, int qid)
 {
-	return idr_find(&uq_mgr->userq_idr, qid);
+	return xa_load(&uq_mgr->userq_mgr_xa, qid);
 }
 
 void
@@ -550,8 +559,9 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 	struct amdgpu_db_info db_info;
 	char *queue_name;
 	bool skip_map_queue;
+	u32 qid;
 	uint64_t index;
-	int qid, r = 0;
+	int r = 0;
 	int priority =
 		(args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >>
 		AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT;
@@ -574,7 +584,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 	 *
 	 * This will also make sure we have a valid eviction fence ready to be used.
 	 */
-	mutex_lock(&adev->userq_mutex);
 	amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
 
 	uq_funcs = adev->userq_funcs[args->in.ip_type];
@@ -637,15 +646,27 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 		goto unlock;
 	}
 
-	qid = idr_alloc(&uq_mgr->userq_idr, queue, 1, AMDGPU_MAX_USERQ_COUNT, GFP_KERNEL);
-	if (qid < 0) {
+	/* Wait for mode-1 reset to complete */
+	down_read(&adev->reset_domain->sem);
+	r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue, GFP_KERNEL));
+	if (r) {
+		kfree(queue);
+		up_read(&adev->reset_domain->sem);
+		goto unlock;
+	}
+
+	r = xa_alloc(&uq_mgr->userq_mgr_xa, &qid, queue, XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
+	if (r) {
 		drm_file_err(uq_mgr->file, "Failed to allocate a queue id\n");
 		amdgpu_userq_fence_driver_free(queue);
 		uq_funcs->mqd_destroy(uq_mgr, queue);
 		kfree(queue);
 		r = -ENOMEM;
+		up_read(&adev->reset_domain->sem);
 		goto unlock;
 	}
+	up_read(&adev->reset_domain->sem);
+	queue->userq_mgr = uq_mgr;
 
 	/* don't map the queue if scheduling is halted */
 	if (adev->userq_halt_for_enforce_isolation &&
@@ -658,7 +679,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 		r = amdgpu_userq_map_helper(uq_mgr, queue);
 		if (r) {
 			drm_file_err(uq_mgr->file, "Failed to map Queue\n");
-			idr_remove(&uq_mgr->userq_idr, qid);
+			xa_erase(&uq_mgr->userq_mgr_xa, qid);
 			amdgpu_userq_fence_driver_free(queue);
 			uq_funcs->mqd_destroy(uq_mgr, queue);
 			kfree(queue);
@@ -681,7 +702,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 
 unlock:
 	mutex_unlock(&uq_mgr->userq_mutex);
-	mutex_unlock(&adev->userq_mutex);
 
 	return r;
 }
@@ -779,11 +799,11 @@ static int
 amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
 {
 	struct amdgpu_usermode_queue *queue;
-	int queue_id;
+	unsigned long queue_id;
 	int ret = 0, r;
 
 	/* Resume all the queues for this process */
-	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
+	xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
 
 		if (!amdgpu_userq_buffer_vas_mapped(queue)) {
 			drm_file_err(uq_mgr->file,
@@ -942,11 +962,11 @@ static int
 amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
 {
 	struct amdgpu_usermode_queue *queue;
-	int queue_id;
+	unsigned long queue_id;
 	int ret = 0, r;
 
 	/* Try to unmap all the queues in this process ctx */
-	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
+	xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
 		r = amdgpu_userq_preempt_helper(uq_mgr, queue);
 		if (r)
 			ret = r;
@@ -961,9 +981,10 @@ static int
 amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
 {
 	struct amdgpu_usermode_queue *queue;
-	int queue_id, ret;
+	unsigned long queue_id;
+	int ret;
 
-	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
+	xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
 		struct dma_fence *f = queue->last_fence;
 
 		if (!f || dma_fence_is_signaled(f))
@@ -1016,44 +1037,30 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
 			  struct amdgpu_device *adev)
 {
 	mutex_init(&userq_mgr->userq_mutex);
-	idr_init_base(&userq_mgr->userq_idr, 1);
+	xa_init_flags(&userq_mgr->userq_mgr_xa, XA_FLAGS_ALLOC);
 	userq_mgr->adev = adev;
 	userq_mgr->file = file_priv;
 
-	mutex_lock(&adev->userq_mutex);
-	list_add(&userq_mgr->list, &adev->userq_mgr_list);
-	mutex_unlock(&adev->userq_mutex);
-
 	INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
 	return 0;
 }
 
 void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
 {
-	struct amdgpu_device *adev = userq_mgr->adev;
 	struct amdgpu_usermode_queue *queue;
-	struct amdgpu_userq_mgr *uqm, *tmp;
-	uint32_t queue_id;
+	unsigned long queue_id;
 
 	cancel_delayed_work_sync(&userq_mgr->resume_work);
 
-	mutex_lock(&adev->userq_mutex);
 	mutex_lock(&userq_mgr->userq_mutex);
-	idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
+	xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
 		amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
 		amdgpu_userq_unmap_helper(userq_mgr, queue);
 		amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
 	}
 
-	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
-		if (uqm == userq_mgr) {
-			list_del(&uqm->list);
-			break;
-		}
-	}
-	idr_destroy(&userq_mgr->userq_idr);
+	xa_destroy(&userq_mgr->userq_mgr_xa);
 	mutex_unlock(&userq_mgr->userq_mutex);
-	mutex_unlock(&adev->userq_mutex);
 	mutex_destroy(&userq_mgr->userq_mutex);
 }
 
@@ -1061,25 +1068,23 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
 {
 	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
 	struct amdgpu_usermode_queue *queue;
-	struct amdgpu_userq_mgr *uqm, *tmp;
-	int queue_id;
+	struct amdgpu_userq_mgr *uqm;
+	unsigned long queue_id;
 	int r;
 
 	if (!ip_mask)
 		return 0;
 
-	guard(mutex)(&adev->userq_mutex);
-	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+		uqm = queue->userq_mgr;
 		cancel_delayed_work_sync(&uqm->resume_work);
 		guard(mutex)(&uqm->userq_mutex);
-		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
-			if (adev->in_s0ix)
-				r = amdgpu_userq_preempt_helper(uqm, queue);
-			else
-				r = amdgpu_userq_unmap_helper(uqm, queue);
-			if (r)
-				return r;
-		}
+		if (adev->in_s0ix)
+			r = amdgpu_userq_preempt_helper(uqm, queue);
+		else
+			r = amdgpu_userq_unmap_helper(uqm, queue);
+		if (r)
+			return r;
 	}
 	return 0;
 }
@@ -1088,24 +1093,22 @@ int amdgpu_userq_resume(struct amdgpu_device *adev)
 {
 	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
 	struct amdgpu_usermode_queue *queue;
-	struct amdgpu_userq_mgr *uqm, *tmp;
-	int queue_id;
+	struct amdgpu_userq_mgr *uqm;
+	unsigned long queue_id;
 	int r;
 
 	if (!ip_mask)
 		return 0;
 
-	guard(mutex)(&adev->userq_mutex);
-	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+		uqm = queue->userq_mgr;
 		guard(mutex)(&uqm->userq_mutex);
-		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
-			if (adev->in_s0ix)
-				r = amdgpu_userq_restore_helper(uqm, queue);
-			else
-				r = amdgpu_userq_map_helper(uqm, queue);
-			if (r)
-				return r;
-		}
+		if (adev->in_s0ix)
+			r = amdgpu_userq_restore_helper(uqm, queue);
+		else
+			r = amdgpu_userq_map_helper(uqm, queue);
+		if (r)
+			return r;
 	}
 
 	return 0;
@@ -1116,33 +1119,31 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
 {
 	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
 	struct amdgpu_usermode_queue *queue;
-	struct amdgpu_userq_mgr *uqm, *tmp;
-	int queue_id;
+	struct amdgpu_userq_mgr *uqm;
+	unsigned long queue_id;
 	int ret = 0, r;
 
 	/* only need to stop gfx/compute */
 	if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
 		return 0;
 
-	mutex_lock(&adev->userq_mutex);
 	if (adev->userq_halt_for_enforce_isolation)
 		dev_warn(adev->dev, "userq scheduling already stopped!\n");
 	adev->userq_halt_for_enforce_isolation = true;
-	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+		uqm = queue->userq_mgr;
 		cancel_delayed_work_sync(&uqm->resume_work);
 		mutex_lock(&uqm->userq_mutex);
-		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
-			if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
-			     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
-			    (queue->xcp_id == idx)) {
-				r = amdgpu_userq_preempt_helper(uqm, queue);
-				if (r)
-					ret = r;
-			}
+		if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
+		     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
+		    (queue->xcp_id == idx)) {
+			r = amdgpu_userq_preempt_helper(uqm, queue);
+			if (r)
+				ret = r;
 		}
 		mutex_unlock(&uqm->userq_mutex);
 	}
-	mutex_unlock(&adev->userq_mutex);
+
 	return ret;
 }
 
@@ -1151,21 +1152,20 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
 {
 	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
 	struct amdgpu_usermode_queue *queue;
-	struct amdgpu_userq_mgr *uqm, *tmp;
-	int queue_id;
+	struct amdgpu_userq_mgr *uqm;
+	unsigned long queue_id;
 	int ret = 0, r;
 
 	/* only need to stop gfx/compute */
 	if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
 		return 0;
 
-	mutex_lock(&adev->userq_mutex);
 	if (!adev->userq_halt_for_enforce_isolation)
 		dev_warn(adev->dev, "userq scheduling already started!\n");
 	adev->userq_halt_for_enforce_isolation = false;
-	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+		uqm = queue->userq_mgr;
 		mutex_lock(&uqm->userq_mutex);
-		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
 			if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
 			     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
 			    (queue->xcp_id == idx)) {
@@ -1173,10 +1173,9 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
 				if (r)
 					ret = r;
 			}
-		}
 		mutex_unlock(&uqm->userq_mutex);
 	}
-	mutex_unlock(&adev->userq_mutex);
+
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 036d8dd585cd..09da0617bfa2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -96,11 +96,15 @@ struct amdgpu_userq_funcs {
 
 /* Usermode queues for gfx */
 struct amdgpu_userq_mgr {
-	struct idr			userq_idr;
+	/**
+	 * @userq_mgr_xa: Per-process user queue map (queue ID → queue)
+	 * Key: queue_id (unique ID within the process's userq manager)
+	 * Value: struct amdgpu_usermode_queue
+	 */
+	struct xarray			userq_mgr_xa;
 	struct mutex			userq_mutex;
 	struct amdgpu_device		*adev;
 	struct delayed_work		resume_work;
-	struct list_head		list;
 	struct drm_file			*file;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 761bad98da3e..2aeeaa954882 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -537,7 +537,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
 	}
 
 	/* Retrieve the user queue */
-	queue = idr_find(&userq_mgr->userq_idr, args->queue_id);
+	queue = xa_load(&userq_mgr->userq_mgr_xa, args->queue_id);
 	if (!queue) {
 		r = -ENOENT;
 		goto put_gobj_write;
@@ -899,7 +899,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
 		 */
 		num_fences = dma_fence_dedup_array(fences, num_fences);
 
-		waitq = idr_find(&userq_mgr->userq_idr, wait_info->waitq_id);
+		waitq = xa_load(&userq_mgr->userq_mgr_xa, wait_info->waitq_id);
 		if (!waitq) {
 			r = -EINVAL;
 			goto free_fences;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index 829129ad7bd1..b8486b0d0d69 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -205,9 +205,9 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
 	int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
 	struct mes_detect_and_reset_queue_input input;
 	struct amdgpu_usermode_queue *queue;
-	struct amdgpu_userq_mgr *uqm, *tmp;
 	unsigned int hung_db_num = 0;
-	int queue_id, r, i;
+	unsigned long queue_id;
+	int r, i;
 	u32 db_array[4];
 
 	if (db_array_size > 4) {
@@ -227,16 +227,14 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
 	if (r) {
 		dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
 	} else if (hung_db_num) {
-		list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
-			idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
-				if (queue->queue_type == queue_type) {
-					for (i = 0; i < hung_db_num; i++) {
-						if (queue->doorbell_index == db_array[i]) {
-							queue->state = AMDGPU_USERQ_STATE_HUNG;
-							atomic_inc(&adev->gpu_reset_counter);
-							amdgpu_userq_fence_driver_force_completion(queue);
-							drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
-						}
+		xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+			if (queue->queue_type == queue_type) {
+				for (i = 0; i < hung_db_num; i++) {
+					if (queue->doorbell_index == db_array[i]) {
+						queue->state = AMDGPU_USERQ_STATE_HUNG;
+						atomic_inc(&adev->gpu_reset_counter);
+						amdgpu_userq_fence_driver_force_completion(queue);
+						drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
 					}
 				}
 			}
-- 
2.49.0


* [PATCH v8 2/2] drm/amdgpu: Implement user queue reset functionality
  2025-10-15  1:08 [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray Jesse.Zhang
@ 2025-10-15  1:08 ` Jesse.Zhang
  2025-10-23 20:41   ` Alex Deucher
  2025-10-17  1:25 ` [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray Zhang, Jesse(Jie)
  2025-10-23 20:31 ` Alex Deucher
  2 siblings, 1 reply; 5+ messages in thread
From: Jesse.Zhang @ 2025-10-15  1:08 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang, Alex Deucher

This patch adds robust reset handling for user queues (userq) to improve
recovery from queue failures. The key components include:

1. Queue detection and reset logic:
   - amdgpu_userq_detect_and_reset_queues() identifies failed queues
   - Per-IP detect_and_reset callbacks for targeted recovery
   - Falls back to full GPU reset when needed

2. Reset infrastructure:
   - Adds userq_reset_work workqueue for async reset handling
   - Implements pre/post reset handlers for queue state management
   - Integrates with existing GPU reset framework

3. Error handling improvements:
   - Enhanced state tracking with HUNG state
   - Automatic reset triggering on critical failures
   - VRAM loss handling during recovery

4. Integration points:
   - Added to device init/reset paths
   - Called during queue destroy, suspend, and isolation events
   - Handles both individual queue and full GPU resets

The reset functionality works with both gfx/compute and sdma queues,
providing better resilience against queue failures while minimizing
disruption to unaffected queues.
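
Condensed from the diff below (same function and field names), the async
reset path reduces to: register a work item at init, then schedule it on
the reset domain and wait for it when a hung queue is detected.

	/* amdgpu_device_init() */
	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);

	/* detection path */
	static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
	{
		if (amdgpu_device_should_recover_gpu(adev)) {
			amdgpu_reset_domain_schedule(adev->reset_domain,
						     &adev->userq_reset_work);
			/* wait for the reset job to complete */
			flush_work(&adev->userq_reset_work);
		}
	}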

v2: add detection and reset calls when preemption/unmap fails.
    add a per-device userq counter for each user queue type. (Alex)
v3: make sure we hold adev->userq_mutex when calling amdgpu_userq_detect_and_reset_queues. (Alex)
    warn if adev->userq_mutex is not held.
v4: make sure all of the uqm->userq_mutex locks are held.
    warn if uqm->userq_mutex is not held.

v5: use an array for the user queue type counters. (Alex)
    all of the uqm->userq_mutex locks need to be held when calling detect and reset. (Alex)

v6: fix lockdep warning in amdgpu_userq_fence_driver_process

v7: put the queue types in an array and use a loop in amdgpu_userq_detect_and_reset_queues (Lijo)

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |   8 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c     | 183 ++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h     |   5 +
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  13 +-
 6 files changed, 195 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 3d71da46b471..dc695bf69092 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1314,6 +1314,7 @@ struct amdgpu_device {
 	bool                            apu_prefer_gtt;
 
 	bool                            userq_halt_for_enforce_isolation;
+	struct work_struct              userq_reset_work;
 	struct amdgpu_uid *uid_info;
 
 	/* KFD
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5d9fd2cabc58..b63807b7b15a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4583,6 +4583,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	}
 
 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
 
 	adev->gfx.gfx_off_req_count = 1;
 	adev->gfx.gfx_off_residency = 0;
@@ -5965,6 +5966,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
 				if (r)
 					goto out;
 
+				r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
+				if (r)
+					goto out;
+
 				drm_client_dev_resume(adev_to_drm(tmp_adev), false);
 
 				/*
@@ -6187,6 +6192,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 	if (!amdgpu_sriov_vf(adev))
 		cancel_work(&adev->reset_work);
 #endif
+	cancel_work(&adev->userq_reset_work);
 
 	if (adev->kfd.dev)
 		cancel_work(&adev->kfd.reset_work);
@@ -6307,6 +6313,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
 		    amdgpu_device_ip_need_full_reset(tmp_adev))
 			amdgpu_ras_suspend(tmp_adev);
 
+		amdgpu_userq_pre_reset(tmp_adev);
+
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 87b962df5460..7a27c6c4bb44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -83,6 +83,7 @@ enum amdgpu_ring_type {
 	AMDGPU_RING_TYPE_MES,
 	AMDGPU_RING_TYPE_UMSCH_MM,
 	AMDGPU_RING_TYPE_CPER,
+	AMDGPU_RING_TYPE_MAX,
 };
 
 enum amdgpu_ib_pool_type {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 1e3a1013eb71..4a7a954ee8ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -25,8 +25,10 @@
 #include <drm/drm_auth.h>
 #include <drm/drm_exec.h>
 #include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
 
 #include "amdgpu.h"
+#include "amdgpu_reset.h"
 #include "amdgpu_vm.h"
 #include "amdgpu_userq.h"
 #include "amdgpu_userq_fence.h"
@@ -45,6 +47,69 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
 	return userq_ip_mask;
 }
 
+static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
+{
+	if (amdgpu_device_should_recover_gpu(adev)) {
+		amdgpu_reset_domain_schedule(adev->reset_domain,
+					     &adev->userq_reset_work);
+		/* Wait for the reset job to complete */
+		flush_work(&adev->userq_reset_work);
+	}
+}
+
+static int
+amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+{
+	struct amdgpu_device *adev = uq_mgr->adev;
+	const int queue_types[] = {
+		AMDGPU_RING_TYPE_COMPUTE,
+		AMDGPU_RING_TYPE_GFX,
+		AMDGPU_RING_TYPE_SDMA
+	};
+	const int num_queue_types = ARRAY_SIZE(queue_types);
+	bool gpu_reset = false;
+	int r = 0;
+	int i;
+
+	/* Warn if the current process mutex is not held */
+	WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
+
+	if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+		dev_err(adev->dev, "userq reset disabled by debug mask\n");
+		return 0;
+	}
+
+	/*
+	 * If GPU recovery feature is disabled system-wide,
+	 * skip all reset detection logic
+	 */
+	if (!amdgpu_gpu_recovery)
+		return 0;
+
+	/*
+	 * Iterate through all queue types to detect and reset problematic queues
+	 * Process each queue type in the defined order
+	 */
+	for (i = 0; i < num_queue_types; i++) {
+		int ring_type = queue_types[i];
+		const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
+
+		if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
+		    funcs && funcs->detect_and_reset) {
+			r = funcs->detect_and_reset(adev, ring_type);
+			if (r) {
+				gpu_reset = true;
+				break;
+			}
+		}
+	}
+
+	if (gpu_reset)
+		amdgpu_userq_gpu_reset(adev);
+
+	return r;
+}
+
 static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
 					   struct amdgpu_bo_va_mapping *va_map, u64 addr)
 {
@@ -175,17 +240,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *userq_funcs =
 		adev->userq_funcs[queue->queue_type];
+	bool found_hung_queue = false;
 	int r = 0;
 
 	if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
 		r = userq_funcs->preempt(uq_mgr, queue);
 		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
+			found_hung_queue = true;
 		} else {
 			queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
 		}
 	}
 
+	if (found_hung_queue)
+		amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
 	return r;
 }
 
@@ -217,16 +287,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *userq_funcs =
 		adev->userq_funcs[queue->queue_type];
+	bool found_hung_queue = false;
 	int r = 0;
 
 	if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
 		(queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
 		r = userq_funcs->unmap(uq_mgr, queue);
-		if (r)
+		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
-		else
+			found_hung_queue = true;
+		} else {
 			queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
+		}
 	}
+
+	if (found_hung_queue)
+		amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
 	return r;
 }
 
@@ -243,10 +320,12 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
 		r = userq_funcs->map(uq_mgr, queue);
 		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
+			amdgpu_userq_detect_and_reset_queues(uq_mgr);
 		} else {
 			queue->state = AMDGPU_USERQ_STATE_MAPPED;
 		}
 	}
+
 	return r;
 }
 
@@ -474,10 +553,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
 		amdgpu_bo_unreserve(queue->db_obj.obj);
 	}
 	amdgpu_bo_unref(&queue->db_obj.obj);
-
+	atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
 #if defined(CONFIG_DEBUG_FS)
 	debugfs_remove_recursive(queue->debugfs_queue);
 #endif
+	amdgpu_userq_detect_and_reset_queues(uq_mgr);
 	r = amdgpu_userq_unmap_helper(uq_mgr, queue);
 	/*TODO: It requires a reset for userq hw unmap error*/
 	if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
@@ -699,6 +779,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 	kfree(queue_name);
 
 	args->out.queue_id = qid;
+	atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
 
 unlock:
 	mutex_unlock(&uq_mgr->userq_mutex);
@@ -965,6 +1046,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
 	unsigned long queue_id;
 	int ret = 0, r;
 
+	amdgpu_userq_detect_and_reset_queues(uq_mgr);
 	/* Try to unmap all the queues in this process ctx */
 	xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
 		r = amdgpu_userq_preempt_helper(uq_mgr, queue);
@@ -977,6 +1059,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
 	return ret;
 }
 
+void amdgpu_userq_reset_work(struct work_struct *work)
+{
+	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
+						  userq_reset_work);
+	struct amdgpu_reset_context reset_context;
+
+	memset(&reset_context, 0, sizeof(reset_context));
+
+	reset_context.method = AMD_RESET_METHOD_NONE;
+	reset_context.reset_req_dev = adev;
+	reset_context.src = AMDGPU_RESET_SRC_USERQ;
+	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+	/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}
+
 static int
 amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
 {
@@ -1004,22 +1103,19 @@ void
 amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
 		   struct amdgpu_eviction_fence *ev_fence)
 {
-	int ret;
 	struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
 	struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
+	struct amdgpu_device *adev = uq_mgr->adev;
+	int ret;
 
 	/* Wait for any pending userqueue fence work to finish */
 	ret = amdgpu_userq_wait_for_signal(uq_mgr);
-	if (ret) {
-		drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
-		return;
-	}
+	if (ret)
+		dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
 
 	ret = amdgpu_userq_evict_all(uq_mgr);
-	if (ret) {
-		drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
-		return;
-	}
+	if (ret)
+		dev_err(adev->dev, "Failed to evict userqueue\n");
 
 	/* Signal current eviction fence */
 	amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
@@ -1036,11 +1132,18 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
 int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
 			  struct amdgpu_device *adev)
 {
+	int i;
+
 	mutex_init(&userq_mgr->userq_mutex);
 	xa_init_flags(&userq_mgr->userq_mgr_xa, XA_FLAGS_ALLOC);
 	userq_mgr->adev = adev;
 	userq_mgr->file = file_priv;
 
+	/* Initialize all queue type counters to zero */
+	for (i = 0; i < AMDGPU_RING_TYPE_MAX; i++) {
+		atomic_set(&userq_mgr->userq_count[i], 0);
+	}
+
 	INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
 	return 0;
 }
@@ -1053,6 +1156,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
 	cancel_delayed_work_sync(&userq_mgr->resume_work);
 
 	mutex_lock(&userq_mgr->userq_mutex);
+	amdgpu_userq_detect_and_reset_queues(userq_mgr);
 	xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
 		amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
 		amdgpu_userq_unmap_helper(userq_mgr, queue);
@@ -1079,6 +1183,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
 		uqm = queue->userq_mgr;
 		cancel_delayed_work_sync(&uqm->resume_work);
 		guard(mutex)(&uqm->userq_mutex);
+		amdgpu_userq_detect_and_reset_queues(uqm);
 		if (adev->in_s0ix)
 			r = amdgpu_userq_preempt_helper(uqm, queue);
 		else
@@ -1137,6 +1242,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
 		if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
 		     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
 		    (queue->xcp_id == idx)) {
+			amdgpu_userq_detect_and_reset_queues(uqm);
 			r = amdgpu_userq_preempt_helper(uqm, queue);
 			if (r)
 				ret = r;
@@ -1209,3 +1315,56 @@ int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
 
 	return 0;
 }
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
+{
+	const struct amdgpu_userq_funcs *userq_funcs;
+	struct amdgpu_usermode_queue *queue;
+	struct amdgpu_userq_mgr *uqm;
+	unsigned long queue_id;
+
+	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+		uqm = queue->userq_mgr;
+		cancel_delayed_work_sync(&uqm->resume_work);
+		if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+			amdgpu_userq_wait_for_last_fence(uqm, queue);
+			userq_funcs = adev->userq_funcs[queue->queue_type];
+			userq_funcs->unmap(uqm, queue);
+			/* just mark all queues as hung at this point.
+			 * if unmap succeeds, we could map again
+			 * in amdgpu_userq_post_reset() if vram is not lost
+			 */
+			queue->state = AMDGPU_USERQ_STATE_HUNG;
+			amdgpu_userq_fence_driver_force_completion(queue);
+		}
+	}
+}
+
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
+{
+	/* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
+	 * at this point, we should be able to map it again
+	 * and continue if vram is not lost.
+	 */
+	struct amdgpu_userq_mgr *uqm;
+	struct amdgpu_usermode_queue *queue;
+	const struct amdgpu_userq_funcs *userq_funcs;
+	unsigned long queue_id;
+	int r = 0;
+
+	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+		uqm = queue->userq_mgr;
+		if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
+			userq_funcs = adev->userq_funcs[queue->queue_type];
+			/* Re-map queue */
+			r = userq_funcs->map(uqm, queue);
+			if (r) {
+				dev_err(adev->dev, "Failed to remap queue %lu\n", queue_id);
+				continue;
+			}
+			queue->state = AMDGPU_USERQ_STATE_MAPPED;
+		}
+	}
+
+	return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 09da0617bfa2..c37444427a14 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -106,6 +106,7 @@ struct amdgpu_userq_mgr {
 	struct amdgpu_device		*adev;
 	struct delayed_work		resume_work;
 	struct drm_file			*file;
+	atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
 };
 
 struct amdgpu_db_info {
@@ -148,6 +149,10 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
 						  u32 idx);
 int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
 						   u32 idx);
+void amdgpu_userq_reset_work(struct work_struct *work);
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
+
 int amdgpu_userq_input_va_validate(struct amdgpu_usermode_queue *queue,
 				   u64 addr, u64 expected_size);
 int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 2aeeaa954882..aab55f38d81f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -151,15 +151,20 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
 {
 	struct amdgpu_userq_fence *userq_fence, *tmp;
 	struct dma_fence *fence;
+	unsigned long flags;
 	u64 rptr;
 	int i;
 
 	if (!fence_drv)
 		return;
-
+	/*
+	 * Use interrupt-safe spinlock since this function can be called from:
+	 * 1. Interrupt context (IRQ handlers like gfx_v11_0_eop_irq)
+	 * 2. Process context (workqueues like eviction suspend worker)
+	 * Using regular spin_lock() in interrupt context causes lockdep warnings.
+	 */
+	spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
 	rptr = amdgpu_userq_fence_read(fence_drv);
-
-	spin_lock(&fence_drv->fence_list_lock);
 	list_for_each_entry_safe(userq_fence, tmp, &fence_drv->fences, link) {
 		fence = &userq_fence->base;
 
@@ -174,7 +179,7 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
 		list_del(&userq_fence->link);
 		dma_fence_put(fence);
 	}
-	spin_unlock(&fence_drv->fence_list_lock);
+	spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
 }
 
 void amdgpu_userq_fence_driver_destroy(struct kref *ref)
-- 
2.49.0


* RE: [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray
  2025-10-15  1:08 [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray Jesse.Zhang
  2025-10-15  1:08 ` [PATCH v8 2/2] drm/amdgpu: Implement user queue reset functionality Jesse.Zhang
@ 2025-10-17  1:25 ` Zhang, Jesse(Jie)
  2025-10-23 20:31 ` Alex Deucher
  2 siblings, 0 replies; 5+ messages in thread
From: Zhang, Jesse(Jie) @ 2025-10-17  1:25 UTC (permalink / raw)
  To: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org,
	Koenig, Christian
  Cc: Deucher, Alexander

Ping..
 @Koenig, Christian,
Could you please review the patch again when you have some free time?
Thank you in advance.

Thanks
Jesse

> -----Original Message-----
> From: Jesse.Zhang <Jesse.Zhang@amd.com>
> Sent: Wednesday, October 15, 2025 9:09 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian
> <Christian.Koenig@amd.com>; Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>
> Subject: [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management
> from IDR to XArray
>
> This commit refactors the AMDGPU userqueue management subsystem to replace
> IDR (ID Allocation) with XArray for improved performance, scalability, and
> maintainability. The changes address several issues with the previous IDR
> implementation and provide better locking semantics.
>
> Key changes:
>
> 1. **Global XArray Introduction**:
>    - Added `userq_doorbell_xa` to `struct amdgpu_device` for global queue tracking
>    - Uses doorbell_index as key for efficient global lookup
>    - Replaces the previous `userq_mgr_list` linked list approach
>
> 2. **Per-process XArray Conversion**:
>    - Replaced `userq_idr` with `userq_mgr_xa` in `struct amdgpu_userq_mgr`
>    - Maintains per-process queue tracking with queue_id as key
>    - Uses XA_FLAGS_ALLOC for automatic ID allocation
>
> 3. **Locking Improvements**:
>    - Removed global `userq_mutex` from `struct amdgpu_device`
>    - Replaced with fine-grained XArray locking using XArray's internal spinlocks
>
> 4. **Runtime Idle Check Optimization**:
>    - Updated `amdgpu_runtime_idle_check_userq()` to use xa_empty
>
> 5. **Queue Management Functions**:
>    - Converted all IDR operations to equivalent XArray functions:
>      - `idr_alloc()` → `xa_alloc()`
>      - `idr_find()` → `xa_load()`
>      - `idr_remove()` → `xa_erase()`
>      - `idr_for_each()` → `xa_for_each()`
>
> Benefits:
> - **Performance**: XArray provides better scalability for large numbers of queues
> - **Memory Efficiency**: Reduced memory overhead compared to IDR
> - **Thread Safety**: Improved locking semantics with XArray's internal spinlocks
>
> v2: rename userq_global_xa/userq_xa to userq_doorbell_xa/userq_mgr_xa
>     Remove xa_lock and use its own lock.
>
> v3: Set queue->userq_mgr = uq_mgr in amdgpu_userq_create()
> v4: use xa_store_irq (Christian)
>     hold the read side of the reset lock while creating/destroying queues and the
> manager data structure. (Chritian)
>
> Suggested-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   8 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |   3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |  16 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c     | 151 +++++++++---------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h     |   8 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |   4 +-
>  drivers/gpu/drm/amd/amdgpu/mes_userqueue.c    |  22 ++-
>  7 files changed, 101 insertions(+), 111 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d9bd1d71552e..3d71da46b471 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1174,6 +1174,12 @@ struct amdgpu_device {
>        * queue fence.
>        */
>       struct xarray                   userq_xa;
> +     /**
> +      * @userq_doorbell_xa: Global user queue map (doorbell index → queue)
> +      * Key: doorbell_index (unique global identifier for the queue)
> +      * Value: struct amdgpu_usermode_queue
> +      */
> +     struct xarray userq_doorbell_xa;
>
>       /* df */
>       struct amdgpu_df                df;
> @@ -1307,8 +1313,6 @@ struct amdgpu_device {
>        */
>       bool                            apu_prefer_gtt;
>
> -     struct list_head                userq_mgr_list;
> -     struct mutex                    userq_mutex;
>       bool                            userq_halt_for_enforce_isolation;
>       struct amdgpu_uid *uid_info;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0d5585bc3b04..5d9fd2cabc58 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4533,7 +4533,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>       mutex_init(&adev->gfx.userq_sch_mutex);
>       mutex_init(&adev->gfx.workload_profile_mutex);
>       mutex_init(&adev->vcn.workload_profile_mutex);
> -     mutex_init(&adev->userq_mutex);
>
>       amdgpu_device_init_apu_flags(adev);
>
> @@ -4561,7 +4560,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>
>       INIT_LIST_HEAD(&adev->pm.od_kobj_list);
>
> -     INIT_LIST_HEAD(&adev->userq_mgr_list);
> +     xa_init(&adev->userq_doorbell_xa);
>
>       INIT_DELAYED_WORK(&adev->delayed_init_work,
>                         amdgpu_device_delayed_init_work_handler);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index bebf2ebc4f34..e8909abcf9c0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2772,22 +2772,8 @@ static int amdgpu_runtime_idle_check_userq(struct
> device *dev)
>       struct pci_dev *pdev = to_pci_dev(dev);
>       struct drm_device *drm_dev = pci_get_drvdata(pdev);
>       struct amdgpu_device *adev = drm_to_adev(drm_dev);
> -     struct amdgpu_usermode_queue *queue;
> -     struct amdgpu_userq_mgr *uqm, *tmp;
> -     int queue_id;
> -     int ret = 0;
> -
> -     mutex_lock(&adev->userq_mutex);
> -     list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> -             idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -                     ret = -EBUSY;
> -                     goto done;
> -             }
> -     }
> -done:
> -     mutex_unlock(&adev->userq_mutex);
>
> -     return ret;
> +     return xa_empty(&adev->userq_doorbell_xa) ? 0 : -EBUSY;
>  }
>
>  static int amdgpu_pmops_runtime_suspend(struct device *dev) diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 153d64bc3dcd..1e3a1013eb71 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -30,6 +30,7 @@
>  #include "amdgpu_vm.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_userq_fence.h"
> +#include "amdgpu_reset.h"
>
>  u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)  { @@
> -277,19 +278,27 @@ amdgpu_userq_cleanup(struct amdgpu_userq_mgr *uq_mgr,
>       struct amdgpu_device *adev = uq_mgr->adev;
>       const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue-
> >queue_type];
>
> +     /* Wait for mode-1 reset to complete */
> +     down_read(&adev->reset_domain->sem);
> +
>       /* Drop the userq reference. */
>       amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
>       uq_funcs->mqd_destroy(uq_mgr, queue);
>       amdgpu_userq_fence_driver_free(queue);
> -     idr_remove(&uq_mgr->userq_idr, queue_id);
> +     /* Use interrupt-safe locking since IRQ handlers may access these XArrays
> */
> +     xa_erase_irq(&uq_mgr->userq_mgr_xa, (unsigned long)queue_id);
> +     xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
> +     queue->userq_mgr = NULL;
>       list_del(&queue->userq_va_list);
>       kfree(queue);
> +
> +     up_read(&adev->reset_domain->sem);
>  }
>
>  static struct amdgpu_usermode_queue *
>  amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, int qid)  {
> -     return idr_find(&uq_mgr->userq_idr, qid);
> +     return xa_load(&uq_mgr->userq_mgr_xa, qid);
>  }
>
>  void
> @@ -550,8 +559,9 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
>       struct amdgpu_db_info db_info;
>       char *queue_name;
>       bool skip_map_queue;
> +     u32 qid;
>       uint64_t index;
> -     int qid, r = 0;
> +     int r = 0;
>       int priority =
>               (args->in.flags &
> AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >>
>
>       AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT;
> @@ -574,7 +584,6 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
>        *
>        * This will also make sure we have a valid eviction fence ready to be used.
>        */
> -     mutex_lock(&adev->userq_mutex);
>       amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
>
>       uq_funcs = adev->userq_funcs[args->in.ip_type];
> @@ -637,15 +646,27 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
>               goto unlock;
>       }
>
> -     qid = idr_alloc(&uq_mgr->userq_idr, queue, 1,
> AMDGPU_MAX_USERQ_COUNT, GFP_KERNEL);
> -     if (qid < 0) {
> +     /* Wait for mode-1 reset to complete */
> +     down_read(&adev->reset_domain->sem);
> +     r =xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue,
> GFP_KERNEL));
> +     if (r) {
> +             kfree(queue);
> +             up_read(&adev->reset_domain->sem);
> +             goto unlock;
> +     }
> +
> +     r = xa_alloc(&uq_mgr->userq_mgr_xa, &qid, queue, XA_LIMIT(1,
> AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
> +     if (r) {
>               drm_file_err(uq_mgr->file, "Failed to allocate a queue id\n");
>               amdgpu_userq_fence_driver_free(queue);
>               uq_funcs->mqd_destroy(uq_mgr, queue);
>               kfree(queue);
>               r = -ENOMEM;
> +             up_read(&adev->reset_domain->sem);
>               goto unlock;
>       }
> +     up_read(&adev->reset_domain->sem);
> +     queue->userq_mgr = uq_mgr;
>
>       /* don't map the queue if scheduling is halted */
>       if (adev->userq_halt_for_enforce_isolation && @@ -658,7 +679,7 @@
> amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>               r = amdgpu_userq_map_helper(uq_mgr, queue);
>               if (r) {
>                       drm_file_err(uq_mgr->file, "Failed to map Queue\n");
> -                     idr_remove(&uq_mgr->userq_idr, qid);
> +                     xa_erase(&uq_mgr->userq_mgr_xa, qid);
>                       amdgpu_userq_fence_driver_free(queue);
>                       uq_funcs->mqd_destroy(uq_mgr, queue);
>                       kfree(queue);
> @@ -681,7 +702,6 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
>
>  unlock:
>       mutex_unlock(&uq_mgr->userq_mutex);
> -     mutex_unlock(&adev->userq_mutex);
>
>       return r;
>  }
> @@ -779,11 +799,11 @@ static int
>  amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)  {
>       struct amdgpu_usermode_queue *queue;
> -     int queue_id;
> +     unsigned long queue_id;
>       int ret = 0, r;
>
>       /* Resume all the queues for this process */
> -     idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> +     xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>
>               if (!amdgpu_userq_buffer_vas_mapped(queue)) {
>                       drm_file_err(uq_mgr->file,
> @@ -942,11 +962,11 @@ static int
>  amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)  {
>       struct amdgpu_usermode_queue *queue;
> -     int queue_id;
> +     unsigned long queue_id;
>       int ret = 0, r;
>
>       /* Try to unmap all the queues in this process ctx */
> -     idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> +     xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>               r = amdgpu_userq_preempt_helper(uq_mgr, queue);
>               if (r)
>                       ret = r;
> @@ -961,9 +981,10 @@ static int
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)  {
>       struct amdgpu_usermode_queue *queue;
> -     int queue_id, ret;
> +     unsigned long queue_id;
> +     int ret;
>
> -     idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> +     xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>               struct dma_fence *f = queue->last_fence;
>
>               if (!f || dma_fence_is_signaled(f))
> @@ -1016,44 +1037,30 @@ int amdgpu_userq_mgr_init(struct
> amdgpu_userq_mgr *userq_mgr, struct drm_file *f
>                         struct amdgpu_device *adev)
>  {
>       mutex_init(&userq_mgr->userq_mutex);
> -     idr_init_base(&userq_mgr->userq_idr, 1);
> +     xa_init_flags(&userq_mgr->userq_mgr_xa, XA_FLAGS_ALLOC);
>       userq_mgr->adev = adev;
>       userq_mgr->file = file_priv;
>
> -     mutex_lock(&adev->userq_mutex);
> -     list_add(&userq_mgr->list, &adev->userq_mgr_list);
> -     mutex_unlock(&adev->userq_mutex);
> -
>       INIT_DELAYED_WORK(&userq_mgr->resume_work,
> amdgpu_userq_restore_worker);
>       return 0;
>  }
>
>  void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)  {
> -     struct amdgpu_device *adev = userq_mgr->adev;
>       struct amdgpu_usermode_queue *queue;
> -     struct amdgpu_userq_mgr *uqm, *tmp;
> -     uint32_t queue_id;
> +     unsigned long queue_id;
>
>       cancel_delayed_work_sync(&userq_mgr->resume_work);
>
> -     mutex_lock(&adev->userq_mutex);
>       mutex_lock(&userq_mgr->userq_mutex);
> -     idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
> +     xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
>               amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
>               amdgpu_userq_unmap_helper(userq_mgr, queue);
>               amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
>       }
>
> -     list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> -             if (uqm == userq_mgr) {
> -                     list_del(&uqm->list);
> -                     break;
> -             }
> -     }
> -     idr_destroy(&userq_mgr->userq_idr);
> +     xa_destroy(&userq_mgr->userq_mgr_xa);
>       mutex_unlock(&userq_mgr->userq_mutex);
> -     mutex_unlock(&adev->userq_mutex);
>       mutex_destroy(&userq_mgr->userq_mutex);
>  }
>
> @@ -1061,25 +1068,23 @@ int amdgpu_userq_suspend(struct amdgpu_device
> *adev)  {
>       u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
>       struct amdgpu_usermode_queue *queue;
> -     struct amdgpu_userq_mgr *uqm, *tmp;
> -     int queue_id;
> +     struct amdgpu_userq_mgr *uqm;
> +     unsigned long queue_id;
>       int r;
>
>       if (!ip_mask)
>               return 0;
>
> -     guard(mutex)(&adev->userq_mutex);
> -     list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +     xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +             uqm = queue->userq_mgr;
>               cancel_delayed_work_sync(&uqm->resume_work);
>               guard(mutex)(&uqm->userq_mutex);
> -             idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -                     if (adev->in_s0ix)
> -                             r = amdgpu_userq_preempt_helper(uqm, queue);
> -                     else
> -                             r = amdgpu_userq_unmap_helper(uqm, queue);
> -                     if (r)
> -                             return r;
> -             }
> +             if (adev->in_s0ix)
> +                     r = amdgpu_userq_preempt_helper(uqm, queue);
> +             else
> +                     r = amdgpu_userq_unmap_helper(uqm, queue);
> +             if (r)
> +                     return r;
>       }
>       return 0;
>  }
> @@ -1088,24 +1093,22 @@ int amdgpu_userq_resume(struct amdgpu_device
> *adev)  {
>       u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
>       struct amdgpu_usermode_queue *queue;
> -     struct amdgpu_userq_mgr *uqm, *tmp;
> -     int queue_id;
> +     struct amdgpu_userq_mgr *uqm;
> +     unsigned long queue_id;
>       int r;
>
>       if (!ip_mask)
>               return 0;
>
> -     guard(mutex)(&adev->userq_mutex);
> -     list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +     xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +             uqm = queue->userq_mgr;
>               guard(mutex)(&uqm->userq_mutex);
> -             idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -                     if (adev->in_s0ix)
> -                             r = amdgpu_userq_restore_helper(uqm, queue);
> -                     else
> -                             r = amdgpu_userq_map_helper(uqm, queue);
> -                     if (r)
> -                             return r;
> -             }
> +             if (adev->in_s0ix)
> +                     r = amdgpu_userq_restore_helper(uqm, queue);
> +             else
> +                     r = amdgpu_userq_map_helper(uqm, queue);
> +             if (r)
> +                     return r;
>       }
>
>       return 0;
> @@ -1116,33 +1119,31 @@ int
> amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,  {
>       u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
>       struct amdgpu_usermode_queue *queue;
> -     struct amdgpu_userq_mgr *uqm, *tmp;
> -     int queue_id;
> +     struct amdgpu_userq_mgr *uqm;
> +     unsigned long queue_id;
>       int ret = 0, r;
>
>       /* only need to stop gfx/compute */
>       if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 <<
> AMDGPU_HW_IP_COMPUTE))))
>               return 0;
>
> -     mutex_lock(&adev->userq_mutex);
>       if (adev->userq_halt_for_enforce_isolation)
>               dev_warn(adev->dev, "userq scheduling already stopped!\n");
>       adev->userq_halt_for_enforce_isolation = true;
> -     list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +     xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +             uqm = queue->userq_mgr;
>               cancel_delayed_work_sync(&uqm->resume_work);
>               mutex_lock(&uqm->userq_mutex);
> -             idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -                     if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> -                          (queue->queue_type == AMDGPU_HW_IP_COMPUTE))
> &&
> -                         (queue->xcp_id == idx)) {
> -                             r = amdgpu_userq_preempt_helper(uqm, queue);
> -                             if (r)
> -                                     ret = r;
> -                     }
> +             if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> +                  (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> +                 (queue->xcp_id == idx)) {
> +                     r = amdgpu_userq_preempt_helper(uqm, queue);
> +                     if (r)
> +                             ret = r;
>               }
>               mutex_unlock(&uqm->userq_mutex);
>       }
> -     mutex_unlock(&adev->userq_mutex);
> +
>       return ret;
>  }
>
> @@ -1151,21 +1152,20 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>  {
>       u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
>       struct amdgpu_usermode_queue *queue;
> -     struct amdgpu_userq_mgr *uqm, *tmp;
> -     int queue_id;
> +     struct amdgpu_userq_mgr *uqm;
> +     unsigned long queue_id;
>       int ret = 0, r;
>
>       /* only need to stop gfx/compute */
>       if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
>               return 0;
>
> -     mutex_lock(&adev->userq_mutex);
>       if (!adev->userq_halt_for_enforce_isolation)
>               dev_warn(adev->dev, "userq scheduling already started!\n");
>       adev->userq_halt_for_enforce_isolation = false;
> -     list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +     xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +             uqm = queue->userq_mgr;
>               mutex_lock(&uqm->userq_mutex);
> -             idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
>                       if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
>                            (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
>                           (queue->xcp_id == idx)) {
> @@ -1173,10 +1173,9 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                               if (r)
>                                       ret = r;
>                       }
> -             }
>               mutex_unlock(&uqm->userq_mutex);
>       }
> -     mutex_unlock(&adev->userq_mutex);
> +
>       return ret;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 036d8dd585cd..09da0617bfa2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -96,11 +96,15 @@ struct amdgpu_userq_funcs {
>
>  /* Usermode queues for gfx */
>  struct amdgpu_userq_mgr {
> -     struct idr                      userq_idr;
> +     /**
> +      * @userq_mgr_xa: Per-process user queue map (queue ID → queue)
> +      * Key: queue_id (unique ID within the process's userq manager)
> +      * Value: struct amdgpu_usermode_queue
> +      */
> +     struct xarray                   userq_mgr_xa;
>       struct mutex                    userq_mutex;
>       struct amdgpu_device            *adev;
>       struct delayed_work             resume_work;
> -     struct list_head                list;
>       struct drm_file                 *file;
>  };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 761bad98da3e..2aeeaa954882 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -537,7 +537,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
>       }
>
>       /* Retrieve the user queue */
> -     queue = idr_find(&userq_mgr->userq_idr, args->queue_id);
> +     queue = xa_load(&userq_mgr->userq_mgr_xa, args->queue_id);
>       if (!queue) {
>               r = -ENOENT;
>               goto put_gobj_write;
> @@ -899,7 +899,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>                */
>               num_fences = dma_fence_dedup_array(fences, num_fences);
>
> -             waitq = idr_find(&userq_mgr->userq_idr, wait_info->waitq_id);
> +             waitq = xa_load(&userq_mgr->userq_mgr_xa, wait_info->waitq_id);
>               if (!waitq) {
>                       r = -EINVAL;
>                       goto free_fences;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index 829129ad7bd1..b8486b0d0d69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -205,9 +205,9 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
>       int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
>       struct mes_detect_and_reset_queue_input input;
>       struct amdgpu_usermode_queue *queue;
> -     struct amdgpu_userq_mgr *uqm, *tmp;
>       unsigned int hung_db_num = 0;
> -     int queue_id, r, i;
> +     unsigned long queue_id;
> +     int r, i;
>       u32 db_array[4];
>
>       if (db_array_size > 4) {
> @@ -227,16 +227,14 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
>       if (r) {
>               dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
>       } else if (hung_db_num) {
> -             list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> -                     idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -                             if (queue->queue_type == queue_type) {
> -                                     for (i = 0; i < hung_db_num; i++) {
> -                                             if (queue->doorbell_index == db_array[i]) {
> -                                                     queue->state = AMDGPU_USERQ_STATE_HUNG;
> -                                                     atomic_inc(&adev->gpu_reset_counter);
> -                                                     amdgpu_userq_fence_driver_force_completion(queue);
> -                                                     drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
> -                                             }
> +             xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +                     if (queue->queue_type == queue_type) {
> +                             for (i = 0; i < hung_db_num; i++) {
> +                                     if (queue->doorbell_index == db_array[i]) {
> +                                             queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                                             atomic_inc(&adev->gpu_reset_counter);
> +                                             amdgpu_userq_fence_driver_force_completion(queue);
> +                                             drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
>                                       }
>                               }
>                       }
> --
> 2.49.0


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray
  2025-10-15  1:08 [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray Jesse.Zhang
  2025-10-15  1:08 ` [PATCH v8 2/2] drm/amdgpu: Implement user queue reset functionality Jesse.Zhang
  2025-10-17  1:25 ` [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray Zhang, Jesse(Jie)
@ 2025-10-23 20:31 ` Alex Deucher
  2 siblings, 0 replies; 5+ messages in thread
From: Alex Deucher @ 2025-10-23 20:31 UTC (permalink / raw)
  To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig

On Tue, Oct 14, 2025 at 9:26 PM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> This commit refactors the AMDGPU userqueue management subsystem to replace
> IDR (ID Allocation) with XArray for improved performance, scalability, and
> maintainability. The changes address several issues with the previous IDR
> implementation and provide better locking semantics.
>
> Key changes:
>
> 1. **Global XArray Introduction**:
>    - Added `userq_doorbell_xa` to `struct amdgpu_device` for global queue tracking
>    - Uses doorbell_index as key for efficient global lookup
>    - Replaces the previous `userq_mgr_list` linked list approach
>
> 2. **Per-process XArray Conversion**:
>    - Replaced `userq_idr` with `userq_mgr_xa` in `struct amdgpu_userq_mgr`
>    - Maintains per-process queue tracking with queue_id as key
>    - Uses XA_FLAGS_ALLOC for automatic ID allocation
>
> 3. **Locking Improvements**:
>    - Removed global `userq_mutex` from `struct amdgpu_device`
>    - Replaced with fine-grained XArray locking using XArray's internal spinlocks
>
> 4. **Runtime Idle Check Optimization**:
>    - Updated `amdgpu_runtime_idle_check_userq()` to use xa_empty
>
> 5. **Queue Management Functions**:
>    - Converted all IDR operations to equivalent XArray functions:
>      - `idr_alloc()` → `xa_alloc()`
>      - `idr_find()` → `xa_load()`
>      - `idr_remove()` → `xa_erase()`
>      - `idr_for_each()` → `xa_for_each()`
>
> Benefits:
> - **Performance**: XArray provides better scalability for large numbers of queues
> - **Memory Efficiency**: Reduced memory overhead compared to IDR
> - **Thread Safety**: Improved locking semantics with XArray's internal spinlocks
>
> v2: rename userq_global_xa/userq_xa to userq_doorbell_xa/userq_mgr_xa
>     Remove xa_lock and use its own lock.
>
> v3: Set queue->userq_mgr = uq_mgr in amdgpu_userq_create()
> v4: use xa_store_irq (Christian)
>     hold the read side of the reset lock while creating/destroying queues and the manager data structure. (Christian)
>
> Suggested-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>

Acked-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   8 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |   3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |  16 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c     | 151 +++++++++---------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h     |   8 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |   4 +-
>  drivers/gpu/drm/amd/amdgpu/mes_userqueue.c    |  22 ++-
>  7 files changed, 101 insertions(+), 111 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d9bd1d71552e..3d71da46b471 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1174,6 +1174,12 @@ struct amdgpu_device {
>          * queue fence.
>          */
>         struct xarray                   userq_xa;
> +       /**
> +        * @userq_doorbell_xa: Global user queue map (doorbell index → queue)
> +        * Key: doorbell_index (unique global identifier for the queue)
> +        * Value: struct amdgpu_usermode_queue
> +        */
> +       struct xarray userq_doorbell_xa;
>
>         /* df */
>         struct amdgpu_df                df;
> @@ -1307,8 +1313,6 @@ struct amdgpu_device {
>          */
>         bool                            apu_prefer_gtt;
>
> -       struct list_head                userq_mgr_list;
> -       struct mutex                    userq_mutex;
>         bool                            userq_halt_for_enforce_isolation;
>         struct amdgpu_uid *uid_info;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0d5585bc3b04..5d9fd2cabc58 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4533,7 +4533,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         mutex_init(&adev->gfx.userq_sch_mutex);
>         mutex_init(&adev->gfx.workload_profile_mutex);
>         mutex_init(&adev->vcn.workload_profile_mutex);
> -       mutex_init(&adev->userq_mutex);
>
>         amdgpu_device_init_apu_flags(adev);
>
> @@ -4561,7 +4560,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>
>         INIT_LIST_HEAD(&adev->pm.od_kobj_list);
>
> -       INIT_LIST_HEAD(&adev->userq_mgr_list);
> +       xa_init(&adev->userq_doorbell_xa);
>
>         INIT_DELAYED_WORK(&adev->delayed_init_work,
>                           amdgpu_device_delayed_init_work_handler);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index bebf2ebc4f34..e8909abcf9c0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2772,22 +2772,8 @@ static int amdgpu_runtime_idle_check_userq(struct device *dev)
>         struct pci_dev *pdev = to_pci_dev(dev);
>         struct drm_device *drm_dev = pci_get_drvdata(pdev);
>         struct amdgpu_device *adev = drm_to_adev(drm_dev);
> -       struct amdgpu_usermode_queue *queue;
> -       struct amdgpu_userq_mgr *uqm, *tmp;
> -       int queue_id;
> -       int ret = 0;
> -
> -       mutex_lock(&adev->userq_mutex);
> -       list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> -               idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -                       ret = -EBUSY;
> -                       goto done;
> -               }
> -       }
> -done:
> -       mutex_unlock(&adev->userq_mutex);
>
> -       return ret;
> +       return xa_empty(&adev->userq_doorbell_xa) ? 0 : -EBUSY;
>  }
>
>  static int amdgpu_pmops_runtime_suspend(struct device *dev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 153d64bc3dcd..1e3a1013eb71 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -30,6 +30,7 @@
>  #include "amdgpu_vm.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_userq_fence.h"
> +#include "amdgpu_reset.h"
>
>  u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
>  {
> @@ -277,19 +278,27 @@ amdgpu_userq_cleanup(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type];
>
> +       /* Wait for mode-1 reset to complete */
> +       down_read(&adev->reset_domain->sem);
> +
>         /* Drop the userq reference. */
>         amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
>         uq_funcs->mqd_destroy(uq_mgr, queue);
>         amdgpu_userq_fence_driver_free(queue);
> -       idr_remove(&uq_mgr->userq_idr, queue_id);
> +       /* Use interrupt-safe locking since IRQ handlers may access these XArrays */
> +       xa_erase_irq(&uq_mgr->userq_mgr_xa, (unsigned long)queue_id);
> +       xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
> +       queue->userq_mgr = NULL;
>         list_del(&queue->userq_va_list);
>         kfree(queue);
> +
> +       up_read(&adev->reset_domain->sem);
>  }
>
>  static struct amdgpu_usermode_queue *
>  amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, int qid)
>  {
> -       return idr_find(&uq_mgr->userq_idr, qid);
> +       return xa_load(&uq_mgr->userq_mgr_xa, qid);
>  }
>
>  void
> @@ -550,8 +559,9 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>         struct amdgpu_db_info db_info;
>         char *queue_name;
>         bool skip_map_queue;
> +       u32 qid;
>         uint64_t index;
> -       int qid, r = 0;
> +       int r = 0;
>         int priority =
>                 (args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >>
>                 AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT;
> @@ -574,7 +584,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>          *
>          * This will also make sure we have a valid eviction fence ready to be used.
>          */
> -       mutex_lock(&adev->userq_mutex);
>         amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
>
>         uq_funcs = adev->userq_funcs[args->in.ip_type];
> @@ -637,15 +646,27 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>                 goto unlock;
>         }
>
> -       qid = idr_alloc(&uq_mgr->userq_idr, queue, 1, AMDGPU_MAX_USERQ_COUNT, GFP_KERNEL);
> -       if (qid < 0) {
> +       /* Wait for mode-1 reset to complete */
> +       down_read(&adev->reset_domain->sem);
> +       r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue, GFP_KERNEL));
> +       if (r) {
> +               kfree(queue);
> +               up_read(&adev->reset_domain->sem);
> +               goto unlock;
> +       }
> +
> +       r = xa_alloc(&uq_mgr->userq_mgr_xa, &qid, queue, XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
> +       if (r) {
>                 drm_file_err(uq_mgr->file, "Failed to allocate a queue id\n");
>                 amdgpu_userq_fence_driver_free(queue);
>                 uq_funcs->mqd_destroy(uq_mgr, queue);
>                 kfree(queue);
>                 r = -ENOMEM;
> +               up_read(&adev->reset_domain->sem);
>                 goto unlock;
>         }
> +       up_read(&adev->reset_domain->sem);
> +       queue->userq_mgr = uq_mgr;
>
>         /* don't map the queue if scheduling is halted */
>         if (adev->userq_halt_for_enforce_isolation &&
> @@ -658,7 +679,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>                 r = amdgpu_userq_map_helper(uq_mgr, queue);
>                 if (r) {
>                         drm_file_err(uq_mgr->file, "Failed to map Queue\n");
> -                       idr_remove(&uq_mgr->userq_idr, qid);
> +                       xa_erase(&uq_mgr->userq_mgr_xa, qid);
>                         amdgpu_userq_fence_driver_free(queue);
>                         uq_funcs->mqd_destroy(uq_mgr, queue);
>                         kfree(queue);
> @@ -681,7 +702,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>
>  unlock:
>         mutex_unlock(&uq_mgr->userq_mutex);
> -       mutex_unlock(&adev->userq_mutex);
>
>         return r;
>  }
> @@ -779,11 +799,11 @@ static int
>  amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
>  {
>         struct amdgpu_usermode_queue *queue;
> -       int queue_id;
> +       unsigned long queue_id;
>         int ret = 0, r;
>
>         /* Resume all the queues for this process */
> -       idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> +       xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>
>                 if (!amdgpu_userq_buffer_vas_mapped(queue)) {
>                         drm_file_err(uq_mgr->file,
> @@ -942,11 +962,11 @@ static int
>  amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>  {
>         struct amdgpu_usermode_queue *queue;
> -       int queue_id;
> +       unsigned long queue_id;
>         int ret = 0, r;
>
>         /* Try to unmap all the queues in this process ctx */
> -       idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> +       xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>                 r = amdgpu_userq_preempt_helper(uq_mgr, queue);
>                 if (r)
>                         ret = r;
> @@ -961,9 +981,10 @@ static int
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>  {
>         struct amdgpu_usermode_queue *queue;
> -       int queue_id, ret;
> +       unsigned long queue_id;
> +       int ret;
>
> -       idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> +       xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>                 struct dma_fence *f = queue->last_fence;
>
>                 if (!f || dma_fence_is_signaled(f))
> @@ -1016,44 +1037,30 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
>                           struct amdgpu_device *adev)
>  {
>         mutex_init(&userq_mgr->userq_mutex);
> -       idr_init_base(&userq_mgr->userq_idr, 1);
> +       xa_init_flags(&userq_mgr->userq_mgr_xa, XA_FLAGS_ALLOC);
>         userq_mgr->adev = adev;
>         userq_mgr->file = file_priv;
>
> -       mutex_lock(&adev->userq_mutex);
> -       list_add(&userq_mgr->list, &adev->userq_mgr_list);
> -       mutex_unlock(&adev->userq_mutex);
> -
>         INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
>         return 0;
>  }
>
>  void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
>  {
> -       struct amdgpu_device *adev = userq_mgr->adev;
>         struct amdgpu_usermode_queue *queue;
> -       struct amdgpu_userq_mgr *uqm, *tmp;
> -       uint32_t queue_id;
> +       unsigned long queue_id;
>
>         cancel_delayed_work_sync(&userq_mgr->resume_work);
>
> -       mutex_lock(&adev->userq_mutex);
>         mutex_lock(&userq_mgr->userq_mutex);
> -       idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
> +       xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
>                 amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
>                 amdgpu_userq_unmap_helper(userq_mgr, queue);
>                 amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
>         }
>
> -       list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> -               if (uqm == userq_mgr) {
> -                       list_del(&uqm->list);
> -                       break;
> -               }
> -       }
> -       idr_destroy(&userq_mgr->userq_idr);
> +       xa_destroy(&userq_mgr->userq_mgr_xa);
>         mutex_unlock(&userq_mgr->userq_mutex);
> -       mutex_unlock(&adev->userq_mutex);
>         mutex_destroy(&userq_mgr->userq_mutex);
>  }
>
> @@ -1061,25 +1068,23 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
>  {
>         u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
>         struct amdgpu_usermode_queue *queue;
> -       struct amdgpu_userq_mgr *uqm, *tmp;
> -       int queue_id;
> +       struct amdgpu_userq_mgr *uqm;
> +       unsigned long queue_id;
>         int r;
>
>         if (!ip_mask)
>                 return 0;
>
> -       guard(mutex)(&adev->userq_mutex);
> -       list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +       xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +               uqm = queue->userq_mgr;
>                 cancel_delayed_work_sync(&uqm->resume_work);
>                 guard(mutex)(&uqm->userq_mutex);
> -               idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -                       if (adev->in_s0ix)
> -                               r = amdgpu_userq_preempt_helper(uqm, queue);
> -                       else
> -                               r = amdgpu_userq_unmap_helper(uqm, queue);
> -                       if (r)
> -                               return r;
> -               }
> +               if (adev->in_s0ix)
> +                       r = amdgpu_userq_preempt_helper(uqm, queue);
> +               else
> +                       r = amdgpu_userq_unmap_helper(uqm, queue);
> +               if (r)
> +                       return r;
>         }
>         return 0;
>  }
> @@ -1088,24 +1093,22 @@ int amdgpu_userq_resume(struct amdgpu_device *adev)
>  {
>         u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
>         struct amdgpu_usermode_queue *queue;
> -       struct amdgpu_userq_mgr *uqm, *tmp;
> -       int queue_id;
> +       struct amdgpu_userq_mgr *uqm;
> +       unsigned long queue_id;
>         int r;
>
>         if (!ip_mask)
>                 return 0;
>
> -       guard(mutex)(&adev->userq_mutex);
> -       list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +       xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +               uqm = queue->userq_mgr;
>                 guard(mutex)(&uqm->userq_mutex);
> -               idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -                       if (adev->in_s0ix)
> -                               r = amdgpu_userq_restore_helper(uqm, queue);
> -                       else
> -                               r = amdgpu_userq_map_helper(uqm, queue);
> -                       if (r)
> -                               return r;
> -               }
> +               if (adev->in_s0ix)
> +                       r = amdgpu_userq_restore_helper(uqm, queue);
> +               else
> +                       r = amdgpu_userq_map_helper(uqm, queue);
> +               if (r)
> +                       return r;
>         }
>
>         return 0;
> @@ -1116,33 +1119,31 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>  {
>         u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
>         struct amdgpu_usermode_queue *queue;
> -       struct amdgpu_userq_mgr *uqm, *tmp;
> -       int queue_id;
> +       struct amdgpu_userq_mgr *uqm;
> +       unsigned long queue_id;
>         int ret = 0, r;
>
>         /* only need to stop gfx/compute */
>         if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
>                 return 0;
>
> -       mutex_lock(&adev->userq_mutex);
>         if (adev->userq_halt_for_enforce_isolation)
>                 dev_warn(adev->dev, "userq scheduling already stopped!\n");
>         adev->userq_halt_for_enforce_isolation = true;
> -       list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +       xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +               uqm = queue->userq_mgr;
>                 cancel_delayed_work_sync(&uqm->resume_work);
>                 mutex_lock(&uqm->userq_mutex);
> -               idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -                       if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> -                            (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> -                           (queue->xcp_id == idx)) {
> -                               r = amdgpu_userq_preempt_helper(uqm, queue);
> -                               if (r)
> -                                       ret = r;
> -                       }
> +               if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> +                    (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> +                   (queue->xcp_id == idx)) {
> +                       r = amdgpu_userq_preempt_helper(uqm, queue);
> +                       if (r)
> +                               ret = r;
>                 }
>                 mutex_unlock(&uqm->userq_mutex);
>         }
> -       mutex_unlock(&adev->userq_mutex);
> +
>         return ret;
>  }
>
> @@ -1151,21 +1152,20 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>  {
>         u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
>         struct amdgpu_usermode_queue *queue;
> -       struct amdgpu_userq_mgr *uqm, *tmp;
> -       int queue_id;
> +       struct amdgpu_userq_mgr *uqm;
> +       unsigned long queue_id;
>         int ret = 0, r;
>
>         /* only need to stop gfx/compute */
>         if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
>                 return 0;
>
> -       mutex_lock(&adev->userq_mutex);
>         if (!adev->userq_halt_for_enforce_isolation)
>                 dev_warn(adev->dev, "userq scheduling already started!\n");
>         adev->userq_halt_for_enforce_isolation = false;
> -       list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +       xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +               uqm = queue->userq_mgr;
>                 mutex_lock(&uqm->userq_mutex);
> -               idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
>                         if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
>                              (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
>                             (queue->xcp_id == idx)) {
> @@ -1173,10 +1173,9 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                 if (r)
>                                         ret = r;
>                         }
> -               }
>                 mutex_unlock(&uqm->userq_mutex);
>         }
> -       mutex_unlock(&adev->userq_mutex);
> +
>         return ret;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 036d8dd585cd..09da0617bfa2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -96,11 +96,15 @@ struct amdgpu_userq_funcs {
>
>  /* Usermode queues for gfx */
>  struct amdgpu_userq_mgr {
> -       struct idr                      userq_idr;
> +       /**
> +        * @userq_mgr_xa: Per-process user queue map (queue ID → queue)
> +        * Key: queue_id (unique ID within the process's userq manager)
> +        * Value: struct amdgpu_usermode_queue
> +        */
> +       struct xarray                   userq_mgr_xa;
>         struct mutex                    userq_mutex;
>         struct amdgpu_device            *adev;
>         struct delayed_work             resume_work;
> -       struct list_head                list;
>         struct drm_file                 *file;
>  };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 761bad98da3e..2aeeaa954882 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -537,7 +537,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
>         }
>
>         /* Retrieve the user queue */
> -       queue = idr_find(&userq_mgr->userq_idr, args->queue_id);
> +       queue = xa_load(&userq_mgr->userq_mgr_xa, args->queue_id);
>         if (!queue) {
>                 r = -ENOENT;
>                 goto put_gobj_write;
> @@ -899,7 +899,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>                  */
>                 num_fences = dma_fence_dedup_array(fences, num_fences);
>
> -               waitq = idr_find(&userq_mgr->userq_idr, wait_info->waitq_id);
> +               waitq = xa_load(&userq_mgr->userq_mgr_xa, wait_info->waitq_id);
>                 if (!waitq) {
>                         r = -EINVAL;
>                         goto free_fences;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index 829129ad7bd1..b8486b0d0d69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -205,9 +205,9 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
>         int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
>         struct mes_detect_and_reset_queue_input input;
>         struct amdgpu_usermode_queue *queue;
> -       struct amdgpu_userq_mgr *uqm, *tmp;
>         unsigned int hung_db_num = 0;
> -       int queue_id, r, i;
> +       unsigned long queue_id;
> +       int r, i;
>         u32 db_array[4];
>
>         if (db_array_size > 4) {
> @@ -227,16 +227,14 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
>         if (r) {
>                 dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
>         } else if (hung_db_num) {
> -               list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> -                       idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -                               if (queue->queue_type == queue_type) {
> -                                       for (i = 0; i < hung_db_num; i++) {
> -                                               if (queue->doorbell_index == db_array[i]) {
> -                                                       queue->state = AMDGPU_USERQ_STATE_HUNG;
> -                                                       atomic_inc(&adev->gpu_reset_counter);
> -                                                       amdgpu_userq_fence_driver_force_completion(queue);
> -                                                       drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
> -                                               }
> +               xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +                       if (queue->queue_type == queue_type) {
> +                               for (i = 0; i < hung_db_num; i++) {
> +                                       if (queue->doorbell_index == db_array[i]) {
> +                                               queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                                               atomic_inc(&adev->gpu_reset_counter);
> +                                               amdgpu_userq_fence_driver_force_completion(queue);
> +                                               drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
>                                         }
>                                 }
>                         }
> --
> 2.49.0
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v8 2/2] drm/amdgpu: Implement user queue reset functionality
  2025-10-15  1:08 ` [PATCH v8 2/2] drm/amdgpu: Implement user queue reset functionality Jesse.Zhang
@ 2025-10-23 20:41   ` Alex Deucher
  0 siblings, 0 replies; 5+ messages in thread
From: Alex Deucher @ 2025-10-23 20:41 UTC (permalink / raw)
  To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig

On Tue, Oct 14, 2025 at 9:09 PM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> This patch adds robust reset handling for user queues (userq) to improve
> recovery from queue failures. The key components include:
>
> 1. Queue detection and reset logic:
>    - amdgpu_userq_detect_and_reset_queues() identifies failed queues
>    - Per-IP detect_and_reset callbacks for targeted recovery
>    - Falls back to full GPU reset when needed
>
> 2. Reset infrastructure:
>    - Adds userq_reset_work workqueue for async reset handling
>    - Implements pre/post reset handlers for queue state management
>    - Integrates with existing GPU reset framework
>
> 3. Error handling improvements:
>    - Enhanced state tracking with HUNG state
>    - Automatic reset triggering on critical failures
>    - VRAM loss handling during recovery
>
> 4. Integration points:
>    - Added to device init/reset paths
>    - Called during queue destroy, suspend, and isolation events
>    - Handles both individual queue and full GPU resets
>
> The reset functionality works with both gfx/compute and sdma queues,
> providing better resilience against queue failures while minimizing
> disruption to unaffected queues.
>
> v2: add detection and reset calls when preemption/unmap fails.
>     add a per-device userq counter for each user queue type. (Alex)
> v3: make sure we hold the adev->userq_mutex when we call amdgpu_userq_detect_and_reset_queues. (Alex)
>    warn if the adev->userq_mutex is not held.
> v4: make sure all of the uqm->userq_mutex locks are held.
>     warn if the uqm->userq_mutex is not held.
>
> v5: Use array for user queue type counters.(Alex)
>     all of the uqm->userq_mutex need to be held when calling detect and reset.  (Alex)
>
> v6: fix lockdep warning in amdgpu_userq_fence_driver_process
>
> v7: put the queue types in an array and use a loop in amdgpu_userq_detect_and_reset_queues (Lijo)
>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |   8 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c     | 183 ++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h     |   5 +
>  .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  13 +-
>  6 files changed, 195 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 3d71da46b471..dc695bf69092 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1314,6 +1314,7 @@ struct amdgpu_device {
>         bool                            apu_prefer_gtt;
>
>         bool                            userq_halt_for_enforce_isolation;
> +       struct work_struct              userq_reset_work;
>         struct amdgpu_uid *uid_info;
>
>         /* KFD
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5d9fd2cabc58..b63807b7b15a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4583,6 +4583,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         }
>
>         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +       INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>
>         adev->gfx.gfx_off_req_count = 1;
>         adev->gfx.gfx_off_residency = 0;
> @@ -5965,6 +5966,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>                                 if (r)
>                                         goto out;
>
> +                               r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> +                               if (r)
> +                                       goto out;
> +
>                                 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
>
>                                 /*
> @@ -6187,6 +6192,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>         if (!amdgpu_sriov_vf(adev))
>                 cancel_work(&adev->reset_work);
>  #endif
> +       cancel_work(&adev->userq_reset_work);
>
>         if (adev->kfd.dev)
>                 cancel_work(&adev->kfd.reset_work);
> @@ -6307,6 +6313,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
>                     amdgpu_device_ip_need_full_reset(tmp_adev))
>                         amdgpu_ras_suspend(tmp_adev);
>
> +               amdgpu_userq_pre_reset(tmp_adev);
> +
>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 87b962df5460..7a27c6c4bb44 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -83,6 +83,7 @@ enum amdgpu_ring_type {
>         AMDGPU_RING_TYPE_MES,
>         AMDGPU_RING_TYPE_UMSCH_MM,
>         AMDGPU_RING_TYPE_CPER,
> +       AMDGPU_RING_TYPE_MAX,
>  };
>
>  enum amdgpu_ib_pool_type {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 1e3a1013eb71..4a7a954ee8ed 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -25,8 +25,10 @@
>  #include <drm/drm_auth.h>
>  #include <drm/drm_exec.h>
>  #include <linux/pm_runtime.h>
> +#include <drm/drm_drv.h>
>
>  #include "amdgpu.h"
> +#include "amdgpu_reset.h"
>  #include "amdgpu_vm.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_userq_fence.h"
> @@ -45,6 +47,69 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
>         return userq_ip_mask;
>  }
>
> +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> +{
> +       if (amdgpu_device_should_recover_gpu(adev)) {
> +               amdgpu_reset_domain_schedule(adev->reset_domain,
> +                                            &adev->userq_reset_work);
> +               /* Wait for the reset job to complete */
> +               flush_work(&adev->userq_reset_work);
> +       }
> +}
> +
> +static int
> +amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> +{
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       const int queue_types[] = {
> +               AMDGPU_RING_TYPE_COMPUTE,
> +               AMDGPU_RING_TYPE_GFX,
> +               AMDGPU_RING_TYPE_SDMA
> +       };
> +       const int num_queue_types = ARRAY_SIZE(queue_types);
> +       bool gpu_reset = false;
> +       int r = 0;
> +       int i;
> +
> +       /* Warn if the current process's userq mutex is not held */
> +       WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
> +
> +       if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> +               dev_err(adev->dev, "userq reset disabled by debug mask\n");
> +               return 0;
> +       }
> +
> +       /*
> +        * If GPU recovery feature is disabled system-wide,
> +        * skip all reset detection logic
> +        */
> +       if (!amdgpu_gpu_recovery)
> +               return 0;
> +
> +       /*
> +        * Iterate through all queue types to detect and reset problematic queues
> +        * Process each queue type in the defined order
> +        */
> +       for (i = 0; i < num_queue_types; i++) {
> +               int ring_type = queue_types[i];
> +               const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];

For consistency with kernel queues, we may want something like:

    if (!amdgpu_userq_is_reset_type_supported(ring_type,
                                              AMDGPU_RESET_TYPE_PER_QUEUE))
            continue;
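
Such a helper doesn't exist on the userq side yet; a minimal sketch of
what it might look like, assuming a per-ring-type supported-reset mask
that each IP would fill in at init time (the userq_supported_reset
field below is an illustrative assumption, not existing API):

    static bool
    amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
                                         int ring_type, int reset_type)
    {
            /* hypothetical mask, analogous to the per-ring
             * supported-reset tracking on the kernel queue side */
            return !!(adev->userq_supported_reset[ring_type] & reset_type);
    }

The detect loop would then skip queue types that only support whole-GPU
recovery instead of calling their detect_and_reset hook.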


> +
> +               if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
> +                   funcs && funcs->detect_and_reset) {
> +                       r = funcs->detect_and_reset(adev, ring_type);
> +                       if (r) {
> +                               gpu_reset = true;
> +                               break;
> +                       }
> +               }
> +       }
> +
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
> +       return r;
> +}
> +
>  static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
>                                            struct amdgpu_bo_va_mapping *va_map, u64 addr)
>  {
> @@ -175,17 +240,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool found_hung_queue = false;
>         int r = 0;
>
>         if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
>                 r = userq_funcs->preempt(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       found_hung_queue = true;
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
>                 }
>         }
>
> +       if (found_hung_queue)
> +               amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +
>         return r;
>  }
>
> @@ -217,16 +287,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool found_hung_queue = false;
>         int r = 0;
>
>         if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
>                 (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
>                 r = userq_funcs->unmap(uq_mgr, queue);
> -               if (r)
> +               if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> -               else
> +                       found_hung_queue = true;
> +               } else {
>                         queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> +               }
>         }
> +
> +       if (found_hung_queue)
> +               amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +
>         return r;
>  }
>
> @@ -243,10 +320,12 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
>                 r = userq_funcs->map(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       amdgpu_userq_detect_and_reset_queues(uq_mgr);
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_MAPPED;
>                 }
>         }
> +
>         return r;
>  }
>
> @@ -474,10 +553,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
>                 amdgpu_bo_unreserve(queue->db_obj.obj);
>         }
>         amdgpu_bo_unref(&queue->db_obj.obj);
> -
> +       atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
>  #if defined(CONFIG_DEBUG_FS)
>         debugfs_remove_recursive(queue->debugfs_queue);
>  #endif
> +       amdgpu_userq_detect_and_reset_queues(uq_mgr);
>         r = amdgpu_userq_unmap_helper(uq_mgr, queue);
>         /*TODO: It requires a reset for userq hw unmap error*/
>         if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
> @@ -699,6 +779,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>         kfree(queue_name);
>
>         args->out.queue_id = qid;
> +       atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
>
>  unlock:
>         mutex_unlock(&uq_mgr->userq_mutex);
> @@ -965,6 +1046,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>         unsigned long queue_id;
>         int ret = 0, r;
>
> +       amdgpu_userq_detect_and_reset_queues(uq_mgr);
>         /* Try to unmap all the queues in this process ctx */
>         xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>                 r = amdgpu_userq_preempt_helper(uq_mgr, queue);
> @@ -977,6 +1059,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>         return ret;
>  }
>
> +void amdgpu_userq_reset_work(struct work_struct *work)
> +{
> +       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> +                                                 userq_reset_work);
> +       struct amdgpu_reset_context reset_context;
> +
> +       memset(&reset_context, 0, sizeof(reset_context));
> +
> +       reset_context.method = AMD_RESET_METHOD_NONE;
> +       reset_context.reset_req_dev = adev;
> +       reset_context.src = AMDGPU_RESET_SRC_USERQ;
> +       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +       /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> +       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +}
> +
>  static int
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>  {
> @@ -1004,22 +1103,19 @@ void
>  amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
>                    struct amdgpu_eviction_fence *ev_fence)
>  {
> -       int ret;
>         struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>         struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       int ret;
>
>         /* Wait for any pending userqueue fence work to finish */
>         ret = amdgpu_userq_wait_for_signal(uq_mgr);
> -       if (ret) {
> -               drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
> -               return;
> -       }
> +       if (ret)
> +               dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
>
>         ret = amdgpu_userq_evict_all(uq_mgr);
> -       if (ret) {
> -               drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
> -               return;
> -       }
> +       if (ret)
> +               dev_err(adev->dev, "Failed to evict userqueue\n");
>
>         /* Signal current eviction fence */
>         amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
> @@ -1036,11 +1132,18 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
>  int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
>                           struct amdgpu_device *adev)
>  {
> +       int i;
> +
>         mutex_init(&userq_mgr->userq_mutex);
>         xa_init_flags(&userq_mgr->userq_mgr_xa, XA_FLAGS_ALLOC);
>         userq_mgr->adev = adev;
>         userq_mgr->file = file_priv;
>
> +       /* Initialize all queue type counters to zero */
> +       for (i = 0; i < AMDGPU_RING_TYPE_MAX; i++) {
> +               atomic_set(&userq_mgr->userq_count[i], 0);

These should already be 0 since we kzalloc the structure.

> +       }
> +
>         INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
>         return 0;
>  }
> @@ -1053,6 +1156,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
>         cancel_delayed_work_sync(&userq_mgr->resume_work);
>
>         mutex_lock(&userq_mgr->userq_mutex);
> +       amdgpu_userq_detect_and_reset_queues(userq_mgr);
>         xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
>                 amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
>                 amdgpu_userq_unmap_helper(userq_mgr, queue);
> @@ -1079,6 +1183,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
>                 uqm = queue->userq_mgr;
>                 cancel_delayed_work_sync(&uqm->resume_work);
>                 guard(mutex)(&uqm->userq_mutex);
> +               amdgpu_userq_detect_and_reset_queues(uqm);
>                 if (adev->in_s0ix)
>                         r = amdgpu_userq_preempt_helper(uqm, queue);
>                 else
> @@ -1137,6 +1242,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                 if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
>                      (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
>                     (queue->xcp_id == idx)) {
> +                       amdgpu_userq_detect_and_reset_queues(uqm);
>                         r = amdgpu_userq_preempt_helper(uqm, queue);
>                         if (r)
>                                 ret = r;
> @@ -1209,3 +1315,56 @@ int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
>
>         return 0;
>  }
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
> +{
> +       const struct amdgpu_userq_funcs *userq_funcs;
> +       struct amdgpu_usermode_queue *queue;
> +       struct amdgpu_userq_mgr *uqm;
> +       unsigned long queue_id;
> +
> +       xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +               uqm = queue->userq_mgr;
> +               cancel_delayed_work_sync(&uqm->resume_work);
> +               if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> +                       amdgpu_userq_wait_for_last_fence(uqm, queue);
> +                       userq_funcs = adev->userq_funcs[queue->queue_type];
> +                       userq_funcs->unmap(uqm, queue);
> +                       /* just mark all queues as hung at this point.
> +                        * if unmap succeeds, we could map again
> +                        * in amdgpu_userq_post_reset() if vram is not lost
> +                        */
> +                       queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       amdgpu_userq_fence_driver_force_completion(queue);
> +               }
> +       }
> +}
> +
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> +       /* any queue marked AMDGPU_USERQ_STATE_HUNG in pre-reset
> +        * should be mappable again at this point, so remap it
> +        * and continue if vram is not lost.
> +        */
> +        */
> +       struct amdgpu_userq_mgr *uqm;
> +       struct amdgpu_usermode_queue *queue;
> +       const struct amdgpu_userq_funcs *userq_funcs;
> +       unsigned long queue_id;
> +       int r = 0;
> +
> +       xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +               uqm = queue->userq_mgr;
> +               if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
> +                       userq_funcs = adev->userq_funcs[queue->queue_type];
> +                       /* Re-map queue */
> +                       r = userq_funcs->map(uqm, queue);
> +                       if (r) {
> +                               dev_err(adev->dev, "Failed to remap queue %lu\n", queue_id);
> +                               continue;
> +                       }
> +                       queue->state = AMDGPU_USERQ_STATE_MAPPED;
> +               }
> +       }
> +
> +       return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 09da0617bfa2..c37444427a14 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -106,6 +106,7 @@ struct amdgpu_userq_mgr {
>         struct amdgpu_device            *adev;
>         struct delayed_work             resume_work;
>         struct drm_file                 *file;
> +       atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
>  };
>
>  struct amdgpu_db_info {
> @@ -148,6 +149,10 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                                   u32 idx);
>  int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                                    u32 idx);
> +void amdgpu_userq_reset_work(struct work_struct *work);
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +
>  int amdgpu_userq_input_va_validate(struct amdgpu_usermode_queue *queue,
>                                    u64 addr, u64 expected_size);
>  int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 2aeeaa954882..aab55f38d81f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c

The changes to this file should be a separate bug fix patch.
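
For reference, the hazard the irqsave variant closes is same-CPU lock
recursion; sketched below with the call sites named in the patch comment
(illustrative trace, not literal code):

    /* process context, e.g. the eviction/suspend worker */
    spin_lock(&fence_drv->fence_list_lock);
            /* EOP interrupt fires on this CPU:
             *   gfx_v11_0_eop_irq()
             *     -> amdgpu_userq_fence_driver_process()
             *        -> spin_lock(&fence_drv->fence_list_lock)
             * which spins forever -- the lock is already held here.
             */
    spin_unlock(&fence_drv->fence_list_lock);

spin_lock_irqsave() disables local interrupts for the critical section,
so the IRQ path can never preempt a holder of the lock on the same CPU.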

Alex

> @@ -151,15 +151,20 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
>  {
>         struct amdgpu_userq_fence *userq_fence, *tmp;
>         struct dma_fence *fence;
> +       unsigned long flags;
>         u64 rptr;
>         int i;
>
>         if (!fence_drv)
>                 return;
> -
> +       /*
> +        * Use interrupt-safe spinlock since this function can be called from:
> +        * 1. Interrupt context (IRQ handlers like gfx_v11_0_eop_irq)
> +        * 2. Process context (workqueues like eviction suspend worker)
> +        * Using regular spin_lock() in interrupt context causes lockdep warnings.
> +        */
> +       spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
>         rptr = amdgpu_userq_fence_read(fence_drv);
> -
> -       spin_lock(&fence_drv->fence_list_lock);
>         list_for_each_entry_safe(userq_fence, tmp, &fence_drv->fences, link) {
>                 fence = &userq_fence->base;
>
> @@ -174,7 +179,7 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
>                 list_del(&userq_fence->link);
>                 dma_fence_put(fence);
>         }
> -       spin_unlock(&fence_drv->fence_list_lock);
> +       spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
>  }
>
>  void amdgpu_userq_fence_driver_destroy(struct kref *ref)
> --
> 2.49.0
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-10-23 20:42 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-15  1:08 [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray Jesse.Zhang
2025-10-15  1:08 ` [PATCH v8 2/2] drm/amdgpu: Implement user queue reset functionality Jesse.Zhang
2025-10-23 20:41   ` Alex Deucher
2025-10-17  1:25 ` [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray Zhang, Jesse(Jie)
2025-10-23 20:31 ` Alex Deucher

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox