* [PATCH v8 2/2] drm/amdgpu: Implement user queue reset functionality
From: Jesse.Zhang @ 2025-10-15 1:08 UTC
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang, Alex Deucher
This patch adds robust reset handling for user queues (userq) to improve
recovery from queue failures. The key components include:
1. Queue detection and reset logic:
- amdgpu_userq_detect_and_reset_queues() identifies failed queues
- Per-IP detect_and_reset callbacks for targeted recovery
- Falls back to full GPU reset when needed
2. Reset infrastructure:
- Adds userq_reset_work workqueue for async reset handling
- Implements pre/post reset handlers for queue state management
- Integrates with existing GPU reset framework
3. Error handling improvements:
- Enhanced state tracking with HUNG state
- Automatic reset triggering on critical failures
- VRAM loss handling during recovery
4. Integration points:
- Added to device init/reset paths
- Called during queue destroy, suspend, and isolation events
- Handles both individual queue and full GPU resets
The reset functionality works with both gfx/compute and sdma queues,
providing better resilience against queue failures while minimizing
disruption to unaffected queues.
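For reference, a minimal sketch of the calling contract this introduces
(illustrative only; the real code is in the diff below): callers hold the
per-manager mutex around detection, and the helper escalates to a full GPU
reset through the new userq_reset_work item when a per-queue reset fails:

	/* caller side, e.g. the preempt/unmap helpers and suspend paths */
	mutex_lock(&uqm->userq_mutex);
	amdgpu_userq_detect_and_reset_queues(uqm); /* WARNs if the mutex is not held */
	mutex_unlock(&uqm->userq_mutex);

	/* inside the helper, when a per-queue reset fails */
	if (amdgpu_device_should_recover_gpu(adev)) {
		amdgpu_reset_domain_schedule(adev->reset_domain,
					     &adev->userq_reset_work);
		flush_work(&adev->userq_reset_work); /* wait for the reset job */
	}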
v2: add detection and reset calls when preemption/unmapping fails.
add a per-device userq counter for each user queue type. (Alex)
v3: make sure we hold the adev->userq_mutex when we call amdgpu_userq_detect_and_reset_queues. (Alex)
warn if the adev->userq_mutex is not held.
v4: make sure we have all of the uqm->userq_mutex held.
warn if the uqm->userq_mutex is not held.
v5: Use array for user queue type counters.(Alex)
all of the uqm->userq_mutex need to be held when calling detect and reset. (Alex)
v6: fix lockdep warning in amdgpu_userq_fence_driver_process
v7: add the queue types in an array and use a loop in amdgpu_userq_detect_and_reset_queues (Lijo)
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 183 ++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 +
.../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 13 +-
6 files changed, 195 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 3d71da46b471..dc695bf69092 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1314,6 +1314,7 @@ struct amdgpu_device {
bool apu_prefer_gtt;
bool userq_halt_for_enforce_isolation;
+ struct work_struct userq_reset_work;
struct amdgpu_uid *uid_info;
/* KFD
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5d9fd2cabc58..b63807b7b15a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4583,6 +4583,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
}
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+ INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
adev->gfx.gfx_off_req_count = 1;
adev->gfx.gfx_off_residency = 0;
@@ -5965,6 +5966,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
if (r)
goto out;
+ r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
+ if (r)
+ goto out;
+
drm_client_dev_resume(adev_to_drm(tmp_adev), false);
/*
@@ -6187,6 +6192,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
if (!amdgpu_sriov_vf(adev))
cancel_work(&adev->reset_work);
#endif
+ cancel_work(&adev->userq_reset_work);
if (adev->kfd.dev)
cancel_work(&adev->kfd.reset_work);
@@ -6307,6 +6313,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
amdgpu_device_ip_need_full_reset(tmp_adev))
amdgpu_ras_suspend(tmp_adev);
+ amdgpu_userq_pre_reset(tmp_adev);
+
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 87b962df5460..7a27c6c4bb44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -83,6 +83,7 @@ enum amdgpu_ring_type {
AMDGPU_RING_TYPE_MES,
AMDGPU_RING_TYPE_UMSCH_MM,
AMDGPU_RING_TYPE_CPER,
+ AMDGPU_RING_TYPE_MAX,
};
enum amdgpu_ib_pool_type {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 1e3a1013eb71..4a7a954ee8ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -25,8 +25,10 @@
#include <drm/drm_auth.h>
#include <drm/drm_exec.h>
#include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
#include "amdgpu.h"
+#include "amdgpu_reset.h"
#include "amdgpu_vm.h"
#include "amdgpu_userq.h"
#include "amdgpu_userq_fence.h"
@@ -45,6 +47,69 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
return userq_ip_mask;
}
+static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
+{
+ if (amdgpu_device_should_recover_gpu(adev)) {
+ amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->userq_reset_work);
+ /* Wait for the reset job to complete */
+ flush_work(&adev->userq_reset_work);
+ }
+}
+
+static int
+amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const int queue_types[] = {
+ AMDGPU_RING_TYPE_COMPUTE,
+ AMDGPU_RING_TYPE_GFX,
+ AMDGPU_RING_TYPE_SDMA
+ };
+ const int num_queue_types = ARRAY_SIZE(queue_types);
+ bool gpu_reset = false;
+ int r = 0;
+ int i;
+
+ /* Warning if current process mutex is not held */
+ WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
+
+ if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+ dev_err(adev->dev, "userq reset disabled by debug mask\n");
+ return 0;
+ }
+
+ /*
+ * If GPU recovery feature is disabled system-wide,
+ * skip all reset detection logic
+ */
+ if (!amdgpu_gpu_recovery)
+ return 0;
+
+ /*
+ * Iterate through all queue types to detect and reset problematic queues
+ * Process each queue type in the defined order
+ */
+ for (i = 0; i < num_queue_types; i++) {
+ int ring_type = queue_types[i];
+ const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
+
+ if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
+ funcs && funcs->detect_and_reset) {
+ r = funcs->detect_and_reset(adev, ring_type);
+ if (r) {
+ gpu_reset = true;
+ break;
+ }
+ }
+ }
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
+ return r;
+}
+
static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
struct amdgpu_bo_va_mapping *va_map, u64 addr)
{
@@ -175,17 +240,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool found_hung_queue = false;
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
r = userq_funcs->preempt(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
+ found_hung_queue = true;
} else {
queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
}
}
+ if (found_hung_queue)
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
return r;
}
@@ -217,16 +287,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool found_hung_queue = false;
int r = 0;
if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
(queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
r = userq_funcs->unmap(uq_mgr, queue);
- if (r)
+ if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
- else
+ found_hung_queue = true;
+ } else {
queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
+ }
}
+
+ if (found_hung_queue)
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
return r;
}
@@ -243,10 +320,12 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
r = userq_funcs->map(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
} else {
queue->state = AMDGPU_USERQ_STATE_MAPPED;
}
}
+
return r;
}
@@ -474,10 +553,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
amdgpu_bo_unreserve(queue->db_obj.obj);
}
amdgpu_bo_unref(&queue->db_obj.obj);
-
+ atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
#if defined(CONFIG_DEBUG_FS)
debugfs_remove_recursive(queue->debugfs_queue);
#endif
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
r = amdgpu_userq_unmap_helper(uq_mgr, queue);
/*TODO: It requires a reset for userq hw unmap error*/
if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
@@ -699,6 +779,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
kfree(queue_name);
args->out.queue_id = qid;
+ atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
unlock:
mutex_unlock(&uq_mgr->userq_mutex);
@@ -965,6 +1046,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
unsigned long queue_id;
int ret = 0, r;
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
/* Try to unmap all the queues in this process ctx */
xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
r = amdgpu_userq_preempt_helper(uq_mgr, queue);
@@ -977,6 +1059,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
return ret;
}
+void amdgpu_userq_reset_work(struct work_struct *work)
+{
+ struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
+ userq_reset_work);
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ reset_context.src = AMDGPU_RESET_SRC_USERQ;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+ amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}
+
static int
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
@@ -1004,22 +1103,19 @@ void
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence *ev_fence)
{
- int ret;
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
+ struct amdgpu_device *adev = uq_mgr->adev;
+ int ret;
/* Wait for any pending userqueue fence work to finish */
ret = amdgpu_userq_wait_for_signal(uq_mgr);
- if (ret) {
- drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
ret = amdgpu_userq_evict_all(uq_mgr);
- if (ret) {
- drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Failed to evict userqueue\n");
/* Signal current eviction fence */
amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
@@ -1036,11 +1132,18 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
struct amdgpu_device *adev)
{
+ int i;
+
mutex_init(&userq_mgr->userq_mutex);
xa_init_flags(&userq_mgr->userq_mgr_xa, XA_FLAGS_ALLOC);
userq_mgr->adev = adev;
userq_mgr->file = file_priv;
+ /* Initialize all queue type counters to zero */
+ for (i = 0; i < AMDGPU_RING_TYPE_MAX; i++) {
+ atomic_set(&userq_mgr->userq_count[i], 0);
+ }
+
INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
return 0;
}
@@ -1053,6 +1156,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
cancel_delayed_work_sync(&userq_mgr->resume_work);
mutex_lock(&userq_mgr->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(userq_mgr);
xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
amdgpu_userq_unmap_helper(userq_mgr, queue);
@@ -1079,6 +1183,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
uqm = queue->userq_mgr;
cancel_delayed_work_sync(&uqm->resume_work);
guard(mutex)(&uqm->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(uqm);
if (adev->in_s0ix)
r = amdgpu_userq_preempt_helper(uqm, queue);
else
@@ -1137,6 +1242,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
(queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
(queue->xcp_id == idx)) {
+ amdgpu_userq_detect_and_reset_queues(uqm);
r = amdgpu_userq_preempt_helper(uqm, queue);
if (r)
ret = r;
@@ -1209,3 +1315,56 @@ int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
return 0;
}
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
+{
+ const struct amdgpu_userq_funcs *userq_funcs;
+ struct amdgpu_usermode_queue *queue;
+ struct amdgpu_userq_mgr *uqm;
+ unsigned long queue_id;
+
+ xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+ uqm = queue->userq_mgr;
+ cancel_delayed_work_sync(&uqm->resume_work);
+ if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ amdgpu_userq_wait_for_last_fence(uqm, queue);
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+ userq_funcs->unmap(uqm, queue);
+ /* just mark all queues as hung at this point.
+ * if unmap succeeds, we could map again
+ * in amdgpu_userq_post_reset() if vram is not lost
+ */
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ amdgpu_userq_fence_driver_force_completion(queue);
+ }
+ }
+}
+
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
+{
+ /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
+ * at this point, we should be able to map it again
+ * and continue if vram is not lost.
+ */
+ struct amdgpu_userq_mgr *uqm;
+ struct amdgpu_usermode_queue *queue;
+ const struct amdgpu_userq_funcs *userq_funcs;
+ unsigned long queue_id;
+ int r = 0;
+
+ xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+ uqm = queue->userq_mgr;
+ if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+ /* Re-map queue */
+ r = userq_funcs->map(uqm, queue);
+ if (r) {
+ dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id);
+ continue;
+ }
+ queue->state = AMDGPU_USERQ_STATE_MAPPED;
+ }
+ }
+
+ return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 09da0617bfa2..c37444427a14 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -106,6 +106,7 @@ struct amdgpu_userq_mgr {
struct amdgpu_device *adev;
struct delayed_work resume_work;
struct drm_file *file;
+ atomic_t userq_count[AMDGPU_RING_TYPE_MAX];
};
struct amdgpu_db_info {
@@ -148,6 +149,10 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
u32 idx);
int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
u32 idx);
+void amdgpu_userq_reset_work(struct work_struct *work);
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
+
int amdgpu_userq_input_va_validate(struct amdgpu_usermode_queue *queue,
u64 addr, u64 expected_size);
int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 2aeeaa954882..aab55f38d81f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -151,15 +151,20 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
{
struct amdgpu_userq_fence *userq_fence, *tmp;
struct dma_fence *fence;
+ unsigned long flags;
u64 rptr;
int i;
if (!fence_drv)
return;
-
+ /*
+ * Use interrupt-safe spinlock since this function can be called from:
+ * 1. Interrupt context (IRQ handlers like gfx_v11_0_eop_irq)
+ * 2. Process context (workqueues like eviction suspend worker)
+ * Using regular spin_lock() in interrupt context causes lockdep warnings.
+ */
+ spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
rptr = amdgpu_userq_fence_read(fence_drv);
-
- spin_lock(&fence_drv->fence_list_lock);
list_for_each_entry_safe(userq_fence, tmp, &fence_drv->fences, link) {
fence = &userq_fence->base;
@@ -174,7 +179,7 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
list_del(&userq_fence->link);
dma_fence_put(fence);
}
- spin_unlock(&fence_drv->fence_list_lock);
+ spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
}
void amdgpu_userq_fence_driver_destroy(struct kref *ref)
--
2.49.0
* Re: [PATCH v8 2/2] drm/amdgpu: Implement user queue reset functionality
From: Alex Deucher @ 2025-10-23 20:41 UTC
To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig
On Tue, Oct 14, 2025 at 9:09 PM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> This patch adds robust reset handling for user queues (userq) to improve
> recovery from queue failures. The key components include:
>
> 1. Queue detection and reset logic:
> - amdgpu_userq_detect_and_reset_queues() identifies failed queues
> - Per-IP detect_and_reset callbacks for targeted recovery
> - Falls back to full GPU reset when needed
>
> 2. Reset infrastructure:
> - Adds userq_reset_work workqueue for async reset handling
> - Implements pre/post reset handlers for queue state management
> - Integrates with existing GPU reset framework
>
> 3. Error handling improvements:
> - Enhanced state tracking with HUNG state
> - Automatic reset triggering on critical failures
> - VRAM loss handling during recovery
>
> 4. Integration points:
> - Added to device init/reset paths
> - Called during queue destroy, suspend, and isolation events
> - Handles both individual queue and full GPU resets
>
> The reset functionality works with both gfx/compute and sdma queues,
> providing better resilience against queue failures while minimizing
> disruption to unaffected queues.
>
> v2: add detection and reset calls when preemption/unmapping fails.
> add a per-device userq counter for each user queue type. (Alex)
> v3: make sure we hold the adev->userq_mutex when we call amdgpu_userq_detect_and_reset_queues. (Alex)
> warn if the adev->userq_mutex is not held.
> v4: make sure we have all of the uqm->userq_mutex held.
> warn if the uqm->userq_mutex is not held.
>
> v5: Use array for user queue type counters.(Alex)
> all of the uqm->userq_mutex need to be held when calling detect and reset. (Alex)
>
> v6: fix lockdep warning in amdgpu_userq_fence_driver_process
>
> v7: add the queue types in an array and use a loop in amdgpu_userq_detect_and_reset_queues (Lijo)
>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 183 ++++++++++++++++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 +
> .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 13 +-
> 6 files changed, 195 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 3d71da46b471..dc695bf69092 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1314,6 +1314,7 @@ struct amdgpu_device {
> bool apu_prefer_gtt;
>
> bool userq_halt_for_enforce_isolation;
> + struct work_struct userq_reset_work;
> struct amdgpu_uid *uid_info;
>
> /* KFD
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5d9fd2cabc58..b63807b7b15a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4583,6 +4583,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> }
>
> INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> + INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>
> adev->gfx.gfx_off_req_count = 1;
> adev->gfx.gfx_off_residency = 0;
> @@ -5965,6 +5966,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
> if (r)
> goto out;
>
> + r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> + if (r)
> + goto out;
> +
> drm_client_dev_resume(adev_to_drm(tmp_adev), false);
>
> /*
> @@ -6187,6 +6192,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
> if (!amdgpu_sriov_vf(adev))
> cancel_work(&adev->reset_work);
> #endif
> + cancel_work(&adev->userq_reset_work);
>
> if (adev->kfd.dev)
> cancel_work(&adev->kfd.reset_work);
> @@ -6307,6 +6313,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
> amdgpu_device_ip_need_full_reset(tmp_adev))
> amdgpu_ras_suspend(tmp_adev);
>
> + amdgpu_userq_pre_reset(tmp_adev);
> +
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 87b962df5460..7a27c6c4bb44 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -83,6 +83,7 @@ enum amdgpu_ring_type {
> AMDGPU_RING_TYPE_MES,
> AMDGPU_RING_TYPE_UMSCH_MM,
> AMDGPU_RING_TYPE_CPER,
> + AMDGPU_RING_TYPE_MAX,
> };
>
> enum amdgpu_ib_pool_type {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 1e3a1013eb71..4a7a954ee8ed 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -25,8 +25,10 @@
> #include <drm/drm_auth.h>
> #include <drm/drm_exec.h>
> #include <linux/pm_runtime.h>
> +#include <drm/drm_drv.h>
>
> #include "amdgpu.h"
> +#include "amdgpu_reset.h"
> #include "amdgpu_vm.h"
> #include "amdgpu_userq.h"
> #include "amdgpu_userq_fence.h"
> @@ -45,6 +47,69 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
> return userq_ip_mask;
> }
>
> +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> +{
> + if (amdgpu_device_should_recover_gpu(adev)) {
> + amdgpu_reset_domain_schedule(adev->reset_domain,
> + &adev->userq_reset_work);
> + /* Wait for the reset job to complete */
> + flush_work(&adev->userq_reset_work);
> + }
> +}
> +
> +static int
> +amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + const int queue_types[] = {
> + AMDGPU_RING_TYPE_COMPUTE,
> + AMDGPU_RING_TYPE_GFX,
> + AMDGPU_RING_TYPE_SDMA
> + };
> + const int num_queue_types = ARRAY_SIZE(queue_types);
> + bool gpu_reset = false;
> + int r = 0;
> + int i;
> +
> + /* Warning if current process mutex is not held */
> + WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
> +
> + if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> + dev_err(adev->dev, "userq reset disabled by debug mask\n");
> + return 0;
> + }
> +
> + /*
> + * If GPU recovery feature is disabled system-wide,
> + * skip all reset detection logic
> + */
> + if (!amdgpu_gpu_recovery)
> + return 0;
> +
> + /*
> + * Iterate through all queue types to detect and reset problematic queues
> + * Process each queue type in the defined order
> + */
> + for (i = 0; i < num_queue_types; i++) {
> + int ring_type = queue_types[i];
> + const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
For consistency with kernel queues, we may want something like:
if (!amdgpu_userq_is_reset_type_supported(ring_type,
AMDGPU_RESET_TYPE_PER_QUEUE))
continue;
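Purely to illustrate the direction (nothing below is in this patch): the
helper name comes from the snippet above, and the per-IP supported_reset
masks are assumed to mirror the kernel-queue reset paths, with adev passed
in since the masks live on the device:

	static bool
	amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
					     int ring_type, u32 reset_type)
	{
		/* assumed mapping onto the existing per-IP reset masks */
		switch (ring_type) {
		case AMDGPU_RING_TYPE_GFX:
			return !!(adev->gfx.gfx_supported_reset & reset_type);
		case AMDGPU_RING_TYPE_COMPUTE:
			return !!(adev->gfx.compute_supported_reset & reset_type);
		case AMDGPU_RING_TYPE_SDMA:
			return !!(adev->sdma.supported_reset & reset_type);
		default:
			return false;
		}
	}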
> +
> + if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
> + funcs && funcs->detect_and_reset) {
> + r = funcs->detect_and_reset(adev, ring_type);
> + if (r) {
> + gpu_reset = true;
> + break;
> + }
> + }
> + }
> +
> + if (gpu_reset)
> + amdgpu_userq_gpu_reset(adev);
> +
> + return r;
> +}
> +
> static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
> struct amdgpu_bo_va_mapping *va_map, u64 addr)
> {
> @@ -175,17 +240,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_device *adev = uq_mgr->adev;
> const struct amdgpu_userq_funcs *userq_funcs =
> adev->userq_funcs[queue->queue_type];
> + bool found_hung_queue = false;
> int r = 0;
>
> if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> r = userq_funcs->preempt(uq_mgr, queue);
> if (r) {
> queue->state = AMDGPU_USERQ_STATE_HUNG;
> + found_hung_queue = true;
> } else {
> queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
> }
> }
>
> + if (found_hung_queue)
> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +
> return r;
> }
>
> @@ -217,16 +287,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_device *adev = uq_mgr->adev;
> const struct amdgpu_userq_funcs *userq_funcs =
> adev->userq_funcs[queue->queue_type];
> + bool found_hung_queue = false;
> int r = 0;
>
> if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
> (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
> r = userq_funcs->unmap(uq_mgr, queue);
> - if (r)
> + if (r) {
> queue->state = AMDGPU_USERQ_STATE_HUNG;
> - else
> + found_hung_queue = true;
> + } else {
> queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> + }
> }
> +
> + if (found_hung_queue)
> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +
> return r;
> }
>
> @@ -243,10 +320,12 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
> r = userq_funcs->map(uq_mgr, queue);
> if (r) {
> queue->state = AMDGPU_USERQ_STATE_HUNG;
> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> } else {
> queue->state = AMDGPU_USERQ_STATE_MAPPED;
> }
> }
> +
> return r;
> }
>
> @@ -474,10 +553,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
> amdgpu_bo_unreserve(queue->db_obj.obj);
> }
> amdgpu_bo_unref(&queue->db_obj.obj);
> -
> + atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
> #if defined(CONFIG_DEBUG_FS)
> debugfs_remove_recursive(queue->debugfs_queue);
> #endif
> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> r = amdgpu_userq_unmap_helper(uq_mgr, queue);
> /*TODO: It requires a reset for userq hw unmap error*/
> if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
> @@ -699,6 +779,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> kfree(queue_name);
>
> args->out.queue_id = qid;
> + atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
>
> unlock:
> mutex_unlock(&uq_mgr->userq_mutex);
> @@ -965,6 +1046,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
> unsigned long queue_id;
> int ret = 0, r;
>
> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> /* Try to unmap all the queues in this process ctx */
> xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
> r = amdgpu_userq_preempt_helper(uq_mgr, queue);
> @@ -977,6 +1059,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
> return ret;
> }
>
> +void amdgpu_userq_reset_work(struct work_struct *work)
> +{
> + struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> + userq_reset_work);
> + struct amdgpu_reset_context reset_context;
> +
> + memset(&reset_context, 0, sizeof(reset_context));
> +
> + reset_context.method = AMD_RESET_METHOD_NONE;
> + reset_context.reset_req_dev = adev;
> + reset_context.src = AMDGPU_RESET_SRC_USERQ;
> + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> + /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> + amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +}
> +
> static int
> amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
> {
> @@ -1004,22 +1103,19 @@ void
> amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_eviction_fence *ev_fence)
> {
> - int ret;
> struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> + struct amdgpu_device *adev = uq_mgr->adev;
> + int ret;
>
> /* Wait for any pending userqueue fence work to finish */
> ret = amdgpu_userq_wait_for_signal(uq_mgr);
> - if (ret) {
> - drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
> - return;
> - }
> + if (ret)
> + dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
>
> ret = amdgpu_userq_evict_all(uq_mgr);
> - if (ret) {
> - drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
> - return;
> - }
> + if (ret)
> + dev_err(adev->dev, "Failed to evict userqueue\n");
>
> /* Signal current eviction fence */
> amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
> @@ -1036,11 +1132,18 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
> int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
> struct amdgpu_device *adev)
> {
> + int i;
> +
> mutex_init(&userq_mgr->userq_mutex);
> xa_init_flags(&userq_mgr->userq_mgr_xa, XA_FLAGS_ALLOC);
> userq_mgr->adev = adev;
> userq_mgr->file = file_priv;
>
> + /* Initialize all queue type counters to zero */
> + for (i = 0; i < AMDGPU_RING_TYPE_MAX; i++) {
> + atomic_set(&userq_mgr->userq_count[i], 0);
These should already be 0 since we kzalloc the structure.
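(For reference, the manager is embedded in the amdgpu_fpriv that
amdgpu_driver_open_kms() allocates with

	fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);

so the counters start out zeroed already.)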
> + }
> +
> INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
> return 0;
> }
> @@ -1053,6 +1156,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
> cancel_delayed_work_sync(&userq_mgr->resume_work);
>
> mutex_lock(&userq_mgr->userq_mutex);
> + amdgpu_userq_detect_and_reset_queues(userq_mgr);
> xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
> amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
> amdgpu_userq_unmap_helper(userq_mgr, queue);
> @@ -1079,6 +1183,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
> uqm = queue->userq_mgr;
> cancel_delayed_work_sync(&uqm->resume_work);
> guard(mutex)(&uqm->userq_mutex);
> + amdgpu_userq_detect_and_reset_queues(uqm);
> if (adev->in_s0ix)
> r = amdgpu_userq_preempt_helper(uqm, queue);
> else
> @@ -1137,6 +1242,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> (queue->xcp_id == idx)) {
> + amdgpu_userq_detect_and_reset_queues(uqm);
> r = amdgpu_userq_preempt_helper(uqm, queue);
> if (r)
> ret = r;
> @@ -1209,3 +1315,56 @@ int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
>
> return 0;
> }
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
> +{
> + const struct amdgpu_userq_funcs *userq_funcs;
> + struct amdgpu_usermode_queue *queue;
> + struct amdgpu_userq_mgr *uqm;
> + unsigned long queue_id;
> +
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + uqm = queue->userq_mgr;
> + cancel_delayed_work_sync(&uqm->resume_work);
> + if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> + amdgpu_userq_wait_for_last_fence(uqm, queue);
> + userq_funcs = adev->userq_funcs[queue->queue_type];
> + userq_funcs->unmap(uqm, queue);
> + /* just mark all queues as hung at this point.
> + * if unmap succeeds, we could map again
> + * in amdgpu_userq_post_reset() if vram is not lost
> + */
> + queue->state = AMDGPU_USERQ_STATE_HUNG;
> + amdgpu_userq_fence_driver_force_completion(queue);
> + }
> + }
> +}
> +
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> +{
> + /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
> + * at this point, we should be able to map it again
> + * and continue if vram is not lost.
> + */
> + struct amdgpu_userq_mgr *uqm;
> + struct amdgpu_usermode_queue *queue;
> + const struct amdgpu_userq_funcs *userq_funcs;
> + unsigned long queue_id;
> + int r = 0;
> +
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + uqm = queue->userq_mgr;
> + if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
> + userq_funcs = adev->userq_funcs[queue->queue_type];
> + /* Re-map queue */
> + r = userq_funcs->map(uqm, queue);
> + if (r) {
> + dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id);
> + continue;
> + }
> + queue->state = AMDGPU_USERQ_STATE_MAPPED;
> + }
> + }
> +
> + return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 09da0617bfa2..c37444427a14 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -106,6 +106,7 @@ struct amdgpu_userq_mgr {
> struct amdgpu_device *adev;
> struct delayed_work resume_work;
> struct drm_file *file;
> + atomic_t userq_count[AMDGPU_RING_TYPE_MAX];
> };
>
> struct amdgpu_db_info {
> @@ -148,6 +149,10 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> u32 idx);
> int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> u32 idx);
> +void amdgpu_userq_reset_work(struct work_struct *work);
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +
> int amdgpu_userq_input_va_validate(struct amdgpu_usermode_queue *queue,
> u64 addr, u64 expected_size);
> int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 2aeeaa954882..aab55f38d81f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
The changes to this file should be a separate bug fix patch.
Alex
> @@ -151,15 +151,20 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
> {
> struct amdgpu_userq_fence *userq_fence, *tmp;
> struct dma_fence *fence;
> + unsigned long flags;
> u64 rptr;
> int i;
>
> if (!fence_drv)
> return;
> -
> + /*
> + * Use interrupt-safe spinlock since this function can be called from:
> + * 1. Interrupt context (IRQ handlers like gfx_v11_0_eop_irq)
> + * 2. Process context (workqueues like eviction suspend worker)
> + * Using regular spin_lock() in interrupt context causes lockdep warnings.
> + */
> + spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
> rptr = amdgpu_userq_fence_read(fence_drv);
> -
> - spin_lock(&fence_drv->fence_list_lock);
> list_for_each_entry_safe(userq_fence, tmp, &fence_drv->fences, link) {
> fence = &userq_fence->base;
>
> @@ -174,7 +179,7 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
> list_del(&userq_fence->link);
> dma_fence_put(fence);
> }
> - spin_unlock(&fence_drv->fence_list_lock);
> + spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
> }
>
> void amdgpu_userq_fence_driver_destroy(struct kref *ref)
> --
> 2.49.0
>
* RE: [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray
From: Zhang, Jesse(Jie) @ 2025-10-17 1:25 UTC
To: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org,
Koenig, Christian
Cc: Deucher, Alexander
Ping..
@Koenig, Christian,
Could you please review the patch again when you have some free time?
Thank you in advance.
Thanks
Jesse
> -----Original Message-----
> From: Jesse.Zhang <Jesse.Zhang@amd.com>
> Sent: Wednesday, October 15, 2025 9:09 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian
> <Christian.Koenig@amd.com>; Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>
> Subject: [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management
> from IDR to XArray
>
> This commit refactors the AMDGPU userqueue management subsystem to replace
> IDR (ID Allocation) with XArray for improved performance, scalability, and
> maintainability. The changes address several issues with the previous IDR
> implementation and provide better locking semantics.
>
> Key changes:
>
> 1. **Global XArray Introduction**:
> - Added `userq_doorbell_xa` to `struct amdgpu_device` for global queue tracking
> - Uses doorbell_index as key for efficient global lookup
> - Replaces the previous `userq_mgr_list` linked list approach
>
> 2. **Per-process XArray Conversion**:
> - Replaced `userq_idr` with `userq_mgr_xa` in `struct amdgpu_userq_mgr`
> - Maintains per-process queue tracking with queue_id as key
> - Uses XA_FLAGS_ALLOC for automatic ID allocation
>
> 3. **Locking Improvements**:
> - Removed global `userq_mutex` from `struct amdgpu_device`
> - Replaced with fine-grained XArray locking using XArray's internal spinlocks
>
> 4. **Runtime Idle Check Optimization**:
> - Updated `amdgpu_runtime_idle_check_userq()` to use xa_empty
>
> 5. **Queue Management Functions**:
> - Converted all IDR operations to equivalent XArray functions:
> - `idr_alloc()` → `xa_alloc()`
> - `idr_find()` → `xa_load()`
> - `idr_remove()` → `xa_erase()`
> - `idr_for_each()` → `xa_for_each()`
>
> Benefits:
> - **Performance**: XArray provides better scalability for large numbers of queues
> - **Memory Efficiency**: Reduced memory overhead compared to IDR
> - **Thread Safety**: Improved locking semantics with XArray's internal spinlocks
>
> v2: rename userq_global_xa/userq_xa to userq_doorbell_xa/userq_mgr_xa
> Remove xa_lock and use its own lock.
>
> v3: Set queue->userq_mgr = uq_mgr in amdgpu_userq_create()
> v4: use xa_store_irq (Christian)
> hold the read side of the reset lock while creating/destroying queues and the
> manager data structure. (Chritian)
>
> Suggested-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 16 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 151 +++++++++---------
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 8 +-
> .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 4 +-
> drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 22 ++-
> 7 files changed, 101 insertions(+), 111 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d9bd1d71552e..3d71da46b471 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1174,6 +1174,12 @@ struct amdgpu_device {
> * queue fence.
> */
> struct xarray userq_xa;
> + /**
> + * @userq_doorbell_xa: Global user queue map (doorbell index → queue)
> + * Key: doorbell_index (unique global identifier for the queue)
> + * Value: struct amdgpu_usermode_queue
> + */
> + struct xarray userq_doorbell_xa;
>
> /* df */
> struct amdgpu_df df;
> @@ -1307,8 +1313,6 @@ struct amdgpu_device {
> */
> bool apu_prefer_gtt;
>
> - struct list_head userq_mgr_list;
> - struct mutex userq_mutex;
> bool userq_halt_for_enforce_isolation;
> struct amdgpu_uid *uid_info;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0d5585bc3b04..5d9fd2cabc58 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4533,7 +4533,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> mutex_init(&adev->gfx.userq_sch_mutex);
> mutex_init(&adev->gfx.workload_profile_mutex);
> mutex_init(&adev->vcn.workload_profile_mutex);
> - mutex_init(&adev->userq_mutex);
>
> amdgpu_device_init_apu_flags(adev);
>
> @@ -4561,7 +4560,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>
> INIT_LIST_HEAD(&adev->pm.od_kobj_list);
>
> - INIT_LIST_HEAD(&adev->userq_mgr_list);
> + xa_init(&adev->userq_doorbell_xa);
>
> INIT_DELAYED_WORK(&adev->delayed_init_work,
> amdgpu_device_delayed_init_work_handler);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index bebf2ebc4f34..e8909abcf9c0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2772,22 +2772,8 @@ static int amdgpu_runtime_idle_check_userq(struct
> device *dev)
> struct pci_dev *pdev = to_pci_dev(dev);
> struct drm_device *drm_dev = pci_get_drvdata(pdev);
> struct amdgpu_device *adev = drm_to_adev(drm_dev);
> - struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - int queue_id;
> - int ret = 0;
> -
> - mutex_lock(&adev->userq_mutex);
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> - ret = -EBUSY;
> - goto done;
> - }
> - }
> -done:
> - mutex_unlock(&adev->userq_mutex);
>
> - return ret;
> + return xa_empty(&adev->userq_doorbell_xa) ? 0 : -EBUSY;
> }
>
> static int amdgpu_pmops_runtime_suspend(struct device *dev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 153d64bc3dcd..1e3a1013eb71 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -30,6 +30,7 @@
> #include "amdgpu_vm.h"
> #include "amdgpu_userq.h"
> #include "amdgpu_userq_fence.h"
> +#include "amdgpu_reset.h"
>
> u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
> {
> @@ -277,19 +278,27 @@ amdgpu_userq_cleanup(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_device *adev = uq_mgr->adev;
> const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue-
> >queue_type];
>
> + /* Wait for mode-1 reset to complete */
> + down_read(&adev->reset_domain->sem);
> +
> /* Drop the userq reference. */
> amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
> uq_funcs->mqd_destroy(uq_mgr, queue);
> amdgpu_userq_fence_driver_free(queue);
> - idr_remove(&uq_mgr->userq_idr, queue_id);
> + /* Use interrupt-safe locking since IRQ handlers may access these XArrays
> */
> + xa_erase_irq(&uq_mgr->userq_mgr_xa, (unsigned long)queue_id);
> + xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
> + queue->userq_mgr = NULL;
> list_del(&queue->userq_va_list);
> kfree(queue);
> +
> + up_read(&adev->reset_domain->sem);
> }
>
> static struct amdgpu_usermode_queue *
> amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, int qid) {
> - return idr_find(&uq_mgr->userq_idr, qid);
> + return xa_load(&uq_mgr->userq_mgr_xa, qid);
> }
>
> void
> @@ -550,8 +559,9 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
> struct amdgpu_db_info db_info;
> char *queue_name;
> bool skip_map_queue;
> + u32 qid;
> uint64_t index;
> - int qid, r = 0;
> + int r = 0;
> int priority =
> (args->in.flags &
> AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >>
>
> AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT;
> @@ -574,7 +584,6 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
> *
> * This will also make sure we have a valid eviction fence ready to be used.
> */
> - mutex_lock(&adev->userq_mutex);
> amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
>
> uq_funcs = adev->userq_funcs[args->in.ip_type];
> @@ -637,15 +646,27 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
> goto unlock;
> }
>
> - qid = idr_alloc(&uq_mgr->userq_idr, queue, 1,
> AMDGPU_MAX_USERQ_COUNT, GFP_KERNEL);
> - if (qid < 0) {
> + /* Wait for mode-1 reset to complete */
> + down_read(&adev->reset_domain->sem);
> + r =xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue,
> GFP_KERNEL));
> + if (r) {
> + kfree(queue);
> + up_read(&adev->reset_domain->sem);
> + goto unlock;
> + }
> +
> + r = xa_alloc(&uq_mgr->userq_mgr_xa, &qid, queue, XA_LIMIT(1,
> AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
> + if (r) {
> drm_file_err(uq_mgr->file, "Failed to allocate a queue id\n");
> amdgpu_userq_fence_driver_free(queue);
> uq_funcs->mqd_destroy(uq_mgr, queue);
> kfree(queue);
> r = -ENOMEM;
> + up_read(&adev->reset_domain->sem);
> goto unlock;
> }
> + up_read(&adev->reset_domain->sem);
> + queue->userq_mgr = uq_mgr;
>
> /* don't map the queue if scheduling is halted */
> if (adev->userq_halt_for_enforce_isolation && @@ -658,7 +679,7 @@
> amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> r = amdgpu_userq_map_helper(uq_mgr, queue);
> if (r) {
> drm_file_err(uq_mgr->file, "Failed to map Queue\n");
> - idr_remove(&uq_mgr->userq_idr, qid);
> + xa_erase(&uq_mgr->userq_mgr_xa, qid);
> amdgpu_userq_fence_driver_free(queue);
> uq_funcs->mqd_destroy(uq_mgr, queue);
> kfree(queue);
> @@ -681,7 +702,6 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
>
> unlock:
> mutex_unlock(&uq_mgr->userq_mutex);
> - mutex_unlock(&adev->userq_mutex);
>
> return r;
> }
> @@ -779,11 +799,11 @@ static int
> amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) {
> struct amdgpu_usermode_queue *queue;
> - int queue_id;
> + unsigned long queue_id;
> int ret = 0, r;
>
> /* Resume all the queues for this process */
> - idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> + xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>
> if (!amdgpu_userq_buffer_vas_mapped(queue)) {
> drm_file_err(uq_mgr->file,
> @@ -942,11 +962,11 @@ static int
> amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) {
> struct amdgpu_usermode_queue *queue;
> - int queue_id;
> + unsigned long queue_id;
> int ret = 0, r;
>
> /* Try to unmap all the queues in this process ctx */
> - idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> + xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
> r = amdgpu_userq_preempt_helper(uq_mgr, queue);
> if (r)
> ret = r;
> @@ -961,9 +981,10 @@ static int
> amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) {
> struct amdgpu_usermode_queue *queue;
> - int queue_id, ret;
> + unsigned long queue_id;
> + int ret;
>
> - idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> + xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
> struct dma_fence *f = queue->last_fence;
>
> if (!f || dma_fence_is_signaled(f))
> @@ -1016,44 +1037,30 @@ int amdgpu_userq_mgr_init(struct
> amdgpu_userq_mgr *userq_mgr, struct drm_file *f
> struct amdgpu_device *adev)
> {
> mutex_init(&userq_mgr->userq_mutex);
> - idr_init_base(&userq_mgr->userq_idr, 1);
> + xa_init_flags(&userq_mgr->userq_mgr_xa, XA_FLAGS_ALLOC);
> userq_mgr->adev = adev;
> userq_mgr->file = file_priv;
>
> - mutex_lock(&adev->userq_mutex);
> - list_add(&userq_mgr->list, &adev->userq_mgr_list);
> - mutex_unlock(&adev->userq_mutex);
> -
> INIT_DELAYED_WORK(&userq_mgr->resume_work,
> amdgpu_userq_restore_worker);
> return 0;
> }
>
> void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr) {
> - struct amdgpu_device *adev = userq_mgr->adev;
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - uint32_t queue_id;
> + unsigned long queue_id;
>
> cancel_delayed_work_sync(&userq_mgr->resume_work);
>
> - mutex_lock(&adev->userq_mutex);
> mutex_lock(&userq_mgr->userq_mutex);
> - idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
> + xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
> amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
> amdgpu_userq_unmap_helper(userq_mgr, queue);
> amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
> }
>
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> - if (uqm == userq_mgr) {
> - list_del(&uqm->list);
> - break;
> - }
> - }
> - idr_destroy(&userq_mgr->userq_idr);
> + xa_destroy(&userq_mgr->userq_mgr_xa);
> mutex_unlock(&userq_mgr->userq_mutex);
> - mutex_unlock(&adev->userq_mutex);
> mutex_destroy(&userq_mgr->userq_mutex);
> }
>
> @@ -1061,25 +1068,23 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
> {
> u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - int queue_id;
> + struct amdgpu_userq_mgr *uqm;
> + unsigned long queue_id;
> int r;
>
> if (!ip_mask)
> return 0;
>
> - guard(mutex)(&adev->userq_mutex);
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + uqm = queue->userq_mgr;
> cancel_delayed_work_sync(&uqm->resume_work);
> guard(mutex)(&uqm->userq_mutex);
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> - if (adev->in_s0ix)
> - r = amdgpu_userq_preempt_helper(uqm, queue);
> - else
> - r = amdgpu_userq_unmap_helper(uqm, queue);
> - if (r)
> - return r;
> - }
> + if (adev->in_s0ix)
> + r = amdgpu_userq_preempt_helper(uqm, queue);
> + else
> + r = amdgpu_userq_unmap_helper(uqm, queue);
> + if (r)
> + return r;
> }
> return 0;
> }
> @@ -1088,24 +1093,22 @@ int amdgpu_userq_resume(struct amdgpu_device *adev)
> {
> u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - int queue_id;
> + struct amdgpu_userq_mgr *uqm;
> + unsigned long queue_id;
> int r;
>
> if (!ip_mask)
> return 0;
>
> - guard(mutex)(&adev->userq_mutex);
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + uqm = queue->userq_mgr;
> guard(mutex)(&uqm->userq_mutex);
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> - if (adev->in_s0ix)
> - r = amdgpu_userq_restore_helper(uqm, queue);
> - else
> - r = amdgpu_userq_map_helper(uqm, queue);
> - if (r)
> - return r;
> - }
> + if (adev->in_s0ix)
> + r = amdgpu_userq_restore_helper(uqm, queue);
> + else
> + r = amdgpu_userq_map_helper(uqm, queue);
> + if (r)
> + return r;
> }
>
> return 0;
> @@ -1116,33 +1119,31 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> {
> u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - int queue_id;
> + struct amdgpu_userq_mgr *uqm;
> + unsigned long queue_id;
> int ret = 0, r;
>
> /* only need to stop gfx/compute */
> if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 <<
> AMDGPU_HW_IP_COMPUTE))))
> return 0;
>
> - mutex_lock(&adev->userq_mutex);
> if (adev->userq_halt_for_enforce_isolation)
> dev_warn(adev->dev, "userq scheduling already stopped!\n");
> adev->userq_halt_for_enforce_isolation = true;
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + uqm = queue->userq_mgr;
> cancel_delayed_work_sync(&uqm->resume_work);
> mutex_lock(&uqm->userq_mutex);
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> - if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> - (queue->queue_type == AMDGPU_HW_IP_COMPUTE))
> &&
> - (queue->xcp_id == idx)) {
> - r = amdgpu_userq_preempt_helper(uqm, queue);
> - if (r)
> - ret = r;
> - }
> + if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> + (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> + (queue->xcp_id == idx)) {
> + r = amdgpu_userq_preempt_helper(uqm, queue);
> + if (r)
> + ret = r;
> }
> mutex_unlock(&uqm->userq_mutex);
> }
> - mutex_unlock(&adev->userq_mutex);
> +
> return ret;
> }
>
> @@ -1151,21 +1152,20 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> {
> u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - int queue_id;
> + struct amdgpu_userq_mgr *uqm;
> + unsigned long queue_id;
> int ret = 0, r;
>
> /* only need to stop gfx/compute */
> if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 <<
> AMDGPU_HW_IP_COMPUTE))))
> return 0;
>
> - mutex_lock(&adev->userq_mutex);
> if (!adev->userq_halt_for_enforce_isolation)
> dev_warn(adev->dev, "userq scheduling already started!\n");
> adev->userq_halt_for_enforce_isolation = false;
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + uqm = queue->userq_mgr;
> mutex_lock(&uqm->userq_mutex);
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> (queue->queue_type == AMDGPU_HW_IP_COMPUTE))
> &&
> (queue->xcp_id == idx)) {
> @@ -1173,10 +1173,9 @@ int
> amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> if (r)
> ret = r;
> }
> - }
> mutex_unlock(&uqm->userq_mutex);
> }
> - mutex_unlock(&adev->userq_mutex);
> +
> return ret;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 036d8dd585cd..09da0617bfa2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -96,11 +96,15 @@ struct amdgpu_userq_funcs {
>
> /* Usermode queues for gfx */
> struct amdgpu_userq_mgr {
> - struct idr userq_idr;
> + /**
> + * @userq_mgr_xa: Per-process user queue map (queue ID → queue)
> + * Key: queue_id (unique ID within the process's userq manager)
> + * Value: struct amdgpu_usermode_queue
> + */
> + struct xarray userq_mgr_xa;
> struct mutex userq_mutex;
> struct amdgpu_device *adev;
> struct delayed_work resume_work;
> - struct list_head list;
> struct drm_file *file;
> };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 761bad98da3e..2aeeaa954882 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -537,7 +537,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev,
> void *data,
> }
>
> /* Retrieve the user queue */
> - queue = idr_find(&userq_mgr->userq_idr, args->queue_id);
> + queue = xa_load(&userq_mgr->userq_mgr_xa, args->queue_id);
> if (!queue) {
> r = -ENOENT;
> goto put_gobj_write;
> @@ -899,7 +899,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void
> *data,
> */
> num_fences = dma_fence_dedup_array(fences, num_fences);
>
> - waitq = idr_find(&userq_mgr->userq_idr, wait_info->waitq_id);
> + waitq = xa_load(&userq_mgr->userq_mgr_xa, wait_info->waitq_id);
> if (!waitq) {
> r = -EINVAL;
> goto free_fences;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index 829129ad7bd1..b8486b0d0d69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -205,9 +205,9 @@ static int mes_userq_detect_and_reset(struct
> amdgpu_device *adev,
> int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
> struct mes_detect_and_reset_queue_input input;
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> unsigned int hung_db_num = 0;
> - int queue_id, r, i;
> + unsigned long queue_id;
> + int r, i;
> u32 db_array[4];
>
> if (db_array_size > 4) {
> @@ -227,16 +227,14 @@ static int mes_userq_detect_and_reset(struct
> amdgpu_device *adev,
> if (r) {
> dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n",
> r);
> } else if (hung_db_num) {
> -	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> -		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> -			if (queue->queue_type == queue_type) {
> -				for (i = 0; i < hung_db_num; i++) {
> -					if (queue->doorbell_index == db_array[i]) {
> -						queue->state = AMDGPU_USERQ_STATE_HUNG;
> -						atomic_inc(&adev->gpu_reset_counter);
> -						amdgpu_userq_fence_driver_force_completion(queue);
> -						drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
> -					}
> +	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +		if (queue->queue_type == queue_type) {
> +			for (i = 0; i < hung_db_num; i++) {
> +				if (queue->doorbell_index == db_array[i]) {
> +					queue->state = AMDGPU_USERQ_STATE_HUNG;
> +					atomic_inc(&adev->gpu_reset_counter);
> +					amdgpu_userq_fence_driver_force_completion(queue);
> +					drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
>  				}
>  			}
>  		}
> --
> 2.49.0
* Re: [PATCH v8 1/2] drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray
From: Alex Deucher @ 2025-10-23 20:31 UTC
To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig
On Tue, Oct 14, 2025 at 9:26 PM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> This commit refactors the AMDGPU userqueue management subsystem to replace
> IDR (ID Allocation) with XArray for improved performance, scalability, and
> maintainability. The changes address several issues with the previous IDR
> implementation and provide better locking semantics.
>
> Key changes:
>
> 1. **Global XArray Introduction**:
> - Added `userq_doorbell_xa` to `struct amdgpu_device` for global queue tracking
> - Uses doorbell_index as key for efficient global lookup
> - Replaces the previous `userq_mgr_list` linked list approach
>
> 2. **Per-process XArray Conversion**:
> - Replaced `userq_idr` with `userq_mgr_xa` in `struct amdgpu_userq_mgr`
> - Maintains per-process queue tracking with queue_id as key
> - Uses XA_FLAGS_ALLOC for automatic ID allocation
>
> 3. **Locking Improvements**:
> - Removed global `userq_mutex` from `struct amdgpu_device`
> - Replaced with fine-grained XArray locking using XArray's internal spinlocks
>
> 4. **Runtime Idle Check Optimization**:
> - Updated `amdgpu_runtime_idle_check_userq()` to use xa_empty
>
> 5. **Queue Management Functions**:
> - Converted all IDR operations to equivalent XArray functions:
> - `idr_alloc()` → `xa_alloc()`
> - `idr_find()` → `xa_load()`
> - `idr_remove()` → `xa_erase()`
> - `idr_for_each()` → `xa_for_each()`
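>
> As a quick illustration of the mapping above (a minimal sketch, not the
> driver code itself; "q", "id" and MAX_COUNT are placeholders):
>
>     struct xarray xa;
>     u32 id;
>
>     xa_init_flags(&xa, XA_FLAGS_ALLOC);  /* was idr_init_base(&idr, 1) */
>
>     /* was: id = idr_alloc(&idr, q, 1, MAX_COUNT, GFP_KERNEL); */
>     r = xa_alloc(&xa, &id, q, XA_LIMIT(1, MAX_COUNT), GFP_KERNEL);
>
>     q = xa_load(&xa, id);   /* was idr_find(&idr, id) */
>     xa_erase(&xa, id);      /* was idr_remove(&idr, id) */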
>
> Benefits:
> - **Performance**: XArray provides better scalability for large numbers of queues
> - **Memory Efficiency**: Reduced memory overhead compared to IDR
> - **Thread Safety**: Improved locking semantics with XArray's internal spinlocks
>
> v2: rename userq_global_xa/userq_xa to userq_doorbell_xa/userq_mgr_xa
> Remove xa_lock and use its own lock.
>
> v3: Set queue->userq_mgr = uq_mgr in amdgpu_userq_create()
> v4: use xa_store_irq (Christian)
> hold the read side of the reset lock while creating/destroying queues and the manager data structure. (Christian)
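>
> A minimal sketch of that v4 rule, condensed from the create path in the
> diff below (error handling omitted):
>
>     down_read(&adev->reset_domain->sem);  /* wait for a mode-1 reset to complete */
>     r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue, GFP_KERNEL));
>     up_read(&adev->reset_domain->sem);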
>
> Suggested-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 16 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 151 +++++++++---------
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 8 +-
> .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 4 +-
> drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 22 ++-
> 7 files changed, 101 insertions(+), 111 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d9bd1d71552e..3d71da46b471 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1174,6 +1174,12 @@ struct amdgpu_device {
> * queue fence.
> */
> struct xarray userq_xa;
> + /**
> + * @userq_doorbell_xa: Global user queue map (doorbell index → queue)
> + * Key: doorbell_index (unique global identifier for the queue)
> + * Value: struct amdgpu_usermode_queue
> + */
> + struct xarray userq_doorbell_xa;
>
> /* df */
> struct amdgpu_df df;
> @@ -1307,8 +1313,6 @@ struct amdgpu_device {
> */
> bool apu_prefer_gtt;
>
> - struct list_head userq_mgr_list;
> - struct mutex userq_mutex;
> bool userq_halt_for_enforce_isolation;
> struct amdgpu_uid *uid_info;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0d5585bc3b04..5d9fd2cabc58 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4533,7 +4533,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> mutex_init(&adev->gfx.userq_sch_mutex);
> mutex_init(&adev->gfx.workload_profile_mutex);
> mutex_init(&adev->vcn.workload_profile_mutex);
> - mutex_init(&adev->userq_mutex);
>
> amdgpu_device_init_apu_flags(adev);
>
> @@ -4561,7 +4560,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>
> INIT_LIST_HEAD(&adev->pm.od_kobj_list);
>
> - INIT_LIST_HEAD(&adev->userq_mgr_list);
> + xa_init(&adev->userq_doorbell_xa);
>
> INIT_DELAYED_WORK(&adev->delayed_init_work,
> amdgpu_device_delayed_init_work_handler);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index bebf2ebc4f34..e8909abcf9c0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2772,22 +2772,8 @@ static int amdgpu_runtime_idle_check_userq(struct device *dev)
> struct pci_dev *pdev = to_pci_dev(dev);
> struct drm_device *drm_dev = pci_get_drvdata(pdev);
> struct amdgpu_device *adev = drm_to_adev(drm_dev);
> - struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - int queue_id;
> - int ret = 0;
> -
> - mutex_lock(&adev->userq_mutex);
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> - ret = -EBUSY;
> - goto done;
> - }
> - }
> -done:
> - mutex_unlock(&adev->userq_mutex);
>
> - return ret;
> + return xa_empty(&adev->userq_doorbell_xa) ? 0 : -EBUSY;
> }
>
> static int amdgpu_pmops_runtime_suspend(struct device *dev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 153d64bc3dcd..1e3a1013eb71 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -30,6 +30,7 @@
> #include "amdgpu_vm.h"
> #include "amdgpu_userq.h"
> #include "amdgpu_userq_fence.h"
> +#include "amdgpu_reset.h"
>
> u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
> {
> @@ -277,19 +278,27 @@ amdgpu_userq_cleanup(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_device *adev = uq_mgr->adev;
> const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type];
>
> + /* Wait for mode-1 reset to complete */
> + down_read(&adev->reset_domain->sem);
> +
> /* Drop the userq reference. */
> amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
> uq_funcs->mqd_destroy(uq_mgr, queue);
> amdgpu_userq_fence_driver_free(queue);
> - idr_remove(&uq_mgr->userq_idr, queue_id);
> + /* Use interrupt-safe locking since IRQ handlers may access these XArrays */
> + xa_erase_irq(&uq_mgr->userq_mgr_xa, (unsigned long)queue_id);
> + xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
> + queue->userq_mgr = NULL;
> list_del(&queue->userq_va_list);
> kfree(queue);
> +
> + up_read(&adev->reset_domain->sem);
> }
>
> static struct amdgpu_usermode_queue *
> amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, int qid)
> {
> - return idr_find(&uq_mgr->userq_idr, qid);
> + return xa_load(&uq_mgr->userq_mgr_xa, qid);
> }
>
> void
> @@ -550,8 +559,9 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> struct amdgpu_db_info db_info;
> char *queue_name;
> bool skip_map_queue;
> + u32 qid;
> uint64_t index;
> - int qid, r = 0;
> + int r = 0;
> int priority =
> (args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >>
> AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT;
> @@ -574,7 +584,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> *
> * This will also make sure we have a valid eviction fence ready to be used.
> */
> - mutex_lock(&adev->userq_mutex);
> amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
>
> uq_funcs = adev->userq_funcs[args->in.ip_type];
> @@ -637,15 +646,27 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> goto unlock;
> }
>
> - qid = idr_alloc(&uq_mgr->userq_idr, queue, 1, AMDGPU_MAX_USERQ_COUNT, GFP_KERNEL);
> - if (qid < 0) {
> + /* Wait for mode-1 reset to complete */
> + down_read(&adev->reset_domain->sem);
> + r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue, GFP_KERNEL));
> + if (r) {
> + kfree(queue);
> + up_read(&adev->reset_domain->sem);
> + goto unlock;
> + }
> +
> + r = xa_alloc(&uq_mgr->userq_mgr_xa, &qid, queue, XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
> + if (r) {
> drm_file_err(uq_mgr->file, "Failed to allocate a queue id\n");
> amdgpu_userq_fence_driver_free(queue);
> uq_funcs->mqd_destroy(uq_mgr, queue);
> kfree(queue);
> r = -ENOMEM;
> + up_read(&adev->reset_domain->sem);
> goto unlock;
> }
> + up_read(&adev->reset_domain->sem);
> + queue->userq_mgr = uq_mgr;
>
> /* don't map the queue if scheduling is halted */
> if (adev->userq_halt_for_enforce_isolation &&
> @@ -658,7 +679,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> r = amdgpu_userq_map_helper(uq_mgr, queue);
> if (r) {
> drm_file_err(uq_mgr->file, "Failed to map Queue\n");
> - idr_remove(&uq_mgr->userq_idr, qid);
> + xa_erase(&uq_mgr->userq_mgr_xa, qid);
> amdgpu_userq_fence_driver_free(queue);
> uq_funcs->mqd_destroy(uq_mgr, queue);
> kfree(queue);
> @@ -681,7 +702,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>
> unlock:
> mutex_unlock(&uq_mgr->userq_mutex);
> - mutex_unlock(&adev->userq_mutex);
>
> return r;
> }
> @@ -779,11 +799,11 @@ static int
> amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
> {
> struct amdgpu_usermode_queue *queue;
> - int queue_id;
> + unsigned long queue_id;
> int ret = 0, r;
>
> /* Resume all the queues for this process */
> - idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> + xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>
> if (!amdgpu_userq_buffer_vas_mapped(queue)) {
> drm_file_err(uq_mgr->file,
> @@ -942,11 +962,11 @@ static int
> amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
> {
> struct amdgpu_usermode_queue *queue;
> - int queue_id;
> + unsigned long queue_id;
> int ret = 0, r;
>
> /* Try to unmap all the queues in this process ctx */
> - idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> + xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
> r = amdgpu_userq_preempt_helper(uq_mgr, queue);
> if (r)
> ret = r;
> @@ -961,9 +981,10 @@ static int
> amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
> {
> struct amdgpu_usermode_queue *queue;
> - int queue_id, ret;
> + unsigned long queue_id;
> + int ret;
>
> - idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> + xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
> struct dma_fence *f = queue->last_fence;
>
> if (!f || dma_fence_is_signaled(f))
> @@ -1016,44 +1037,30 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
> struct amdgpu_device *adev)
> {
> mutex_init(&userq_mgr->userq_mutex);
> - idr_init_base(&userq_mgr->userq_idr, 1);
> + xa_init_flags(&userq_mgr->userq_mgr_xa, XA_FLAGS_ALLOC);
> userq_mgr->adev = adev;
> userq_mgr->file = file_priv;
>
> - mutex_lock(&adev->userq_mutex);
> - list_add(&userq_mgr->list, &adev->userq_mgr_list);
> - mutex_unlock(&adev->userq_mutex);
> -
> INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
> return 0;
> }
>
> void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
> {
> - struct amdgpu_device *adev = userq_mgr->adev;
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - uint32_t queue_id;
> + unsigned long queue_id;
>
> cancel_delayed_work_sync(&userq_mgr->resume_work);
>
> - mutex_lock(&adev->userq_mutex);
> mutex_lock(&userq_mgr->userq_mutex);
> - idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
> + xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
> amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
> amdgpu_userq_unmap_helper(userq_mgr, queue);
> amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
> }
>
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> - if (uqm == userq_mgr) {
> - list_del(&uqm->list);
> - break;
> - }
> - }
> - idr_destroy(&userq_mgr->userq_idr);
> + xa_destroy(&userq_mgr->userq_mgr_xa);
> mutex_unlock(&userq_mgr->userq_mutex);
> - mutex_unlock(&adev->userq_mutex);
> mutex_destroy(&userq_mgr->userq_mutex);
> }
>
> @@ -1061,25 +1068,23 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
> {
> u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - int queue_id;
> + struct amdgpu_userq_mgr *uqm;
> + unsigned long queue_id;
> int r;
>
> if (!ip_mask)
> return 0;
>
> - guard(mutex)(&adev->userq_mutex);
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + uqm = queue->userq_mgr;
> cancel_delayed_work_sync(&uqm->resume_work);
> guard(mutex)(&uqm->userq_mutex);
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> - if (adev->in_s0ix)
> - r = amdgpu_userq_preempt_helper(uqm, queue);
> - else
> - r = amdgpu_userq_unmap_helper(uqm, queue);
> - if (r)
> - return r;
> - }
> + if (adev->in_s0ix)
> + r = amdgpu_userq_preempt_helper(uqm, queue);
> + else
> + r = amdgpu_userq_unmap_helper(uqm, queue);
> + if (r)
> + return r;
> }
> return 0;
> }
> @@ -1088,24 +1093,22 @@ int amdgpu_userq_resume(struct amdgpu_device *adev)
> {
> u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - int queue_id;
> + struct amdgpu_userq_mgr *uqm;
> + unsigned long queue_id;
> int r;
>
> if (!ip_mask)
> return 0;
>
> - guard(mutex)(&adev->userq_mutex);
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + uqm = queue->userq_mgr;
> guard(mutex)(&uqm->userq_mutex);
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> - if (adev->in_s0ix)
> - r = amdgpu_userq_restore_helper(uqm, queue);
> - else
> - r = amdgpu_userq_map_helper(uqm, queue);
> - if (r)
> - return r;
> - }
> + if (adev->in_s0ix)
> + r = amdgpu_userq_restore_helper(uqm, queue);
> + else
> + r = amdgpu_userq_map_helper(uqm, queue);
> + if (r)
> + return r;
> }
>
> return 0;
> @@ -1116,33 +1119,31 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> {
> u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - int queue_id;
> + struct amdgpu_userq_mgr *uqm;
> + unsigned long queue_id;
> int ret = 0, r;
>
> /* only need to stop gfx/compute */
> if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
> return 0;
>
> - mutex_lock(&adev->userq_mutex);
> if (adev->userq_halt_for_enforce_isolation)
> dev_warn(adev->dev, "userq scheduling already stopped!\n");
> adev->userq_halt_for_enforce_isolation = true;
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + uqm = queue->userq_mgr;
> cancel_delayed_work_sync(&uqm->resume_work);
> mutex_lock(&uqm->userq_mutex);
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> - if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> - (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> - (queue->xcp_id == idx)) {
> - r = amdgpu_userq_preempt_helper(uqm, queue);
> - if (r)
> - ret = r;
> - }
> + if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> + (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> + (queue->xcp_id == idx)) {
> + r = amdgpu_userq_preempt_helper(uqm, queue);
> + if (r)
> + ret = r;
> }
> mutex_unlock(&uqm->userq_mutex);
> }
> - mutex_unlock(&adev->userq_mutex);
> +
> return ret;
> }
>
> @@ -1151,21 +1152,20 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> {
> u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> - int queue_id;
> + struct amdgpu_userq_mgr *uqm;
> + unsigned long queue_id;
> int ret = 0, r;
>
> /* only need to stop gfx/compute */
> if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
> return 0;
>
> - mutex_lock(&adev->userq_mutex);
> if (!adev->userq_halt_for_enforce_isolation)
> dev_warn(adev->dev, "userq scheduling already started!\n");
> adev->userq_halt_for_enforce_isolation = false;
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + uqm = queue->userq_mgr;
> mutex_lock(&uqm->userq_mutex);
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> (queue->xcp_id == idx)) {
> @@ -1173,10 +1173,9 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> if (r)
> ret = r;
> }
> - }
> mutex_unlock(&uqm->userq_mutex);
> }
> - mutex_unlock(&adev->userq_mutex);
> +
> return ret;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 036d8dd585cd..09da0617bfa2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -96,11 +96,15 @@ struct amdgpu_userq_funcs {
>
> /* Usermode queues for gfx */
> struct amdgpu_userq_mgr {
> - struct idr userq_idr;
> + /**
> + * @userq_mgr_xa: Per-process user queue map (queue ID → queue)
> + * Key: queue_id (unique ID within the process's userq manager)
> + * Value: struct amdgpu_usermode_queue
> + */
> + struct xarray userq_mgr_xa;
> struct mutex userq_mutex;
> struct amdgpu_device *adev;
> struct delayed_work resume_work;
> - struct list_head list;
> struct drm_file *file;
> };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 761bad98da3e..2aeeaa954882 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -537,7 +537,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
> }
>
> /* Retrieve the user queue */
> - queue = idr_find(&userq_mgr->userq_idr, args->queue_id);
> + queue = xa_load(&userq_mgr->userq_mgr_xa, args->queue_id);
> if (!queue) {
> r = -ENOENT;
> goto put_gobj_write;
> @@ -899,7 +899,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
> */
> num_fences = dma_fence_dedup_array(fences, num_fences);
>
> - waitq = idr_find(&userq_mgr->userq_idr, wait_info->waitq_id);
> + waitq = xa_load(&userq_mgr->userq_mgr_xa, wait_info->waitq_id);
> if (!waitq) {
> r = -EINVAL;
> goto free_fences;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index 829129ad7bd1..b8486b0d0d69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -205,9 +205,9 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
> int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
> struct mes_detect_and_reset_queue_input input;
> struct amdgpu_usermode_queue *queue;
> - struct amdgpu_userq_mgr *uqm, *tmp;
> unsigned int hung_db_num = 0;
> - int queue_id, r, i;
> + unsigned long queue_id;
> + int r, i;
> u32 db_array[4];
>
> if (db_array_size > 4) {
> @@ -227,16 +227,14 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
> if (r) {
> dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
> } else if (hung_db_num) {
> - list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> - idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> - if (queue->queue_type == queue_type) {
> - for (i = 0; i < hung_db_num; i++) {
> - if (queue->doorbell_index == db_array[i]) {
> - queue->state = AMDGPU_USERQ_STATE_HUNG;
> - atomic_inc(&adev->gpu_reset_counter);
> - amdgpu_userq_fence_driver_force_completion(queue);
> - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
> - }
> + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> + if (queue->queue_type == queue_type) {
> + for (i = 0; i < hung_db_num; i++) {
> + if (queue->doorbell_index == db_array[i]) {
> + queue->state = AMDGPU_USERQ_STATE_HUNG;
> + atomic_inc(&adev->gpu_reset_counter);
> + amdgpu_userq_fence_driver_force_completion(queue);
> + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
> }
> }
> }
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 5+ messages in thread