It is illegal to schedule reset work from another reset work!
Fix this by scheduling the userq reset work directly on the work queue
of the reset domain.
Not fully tested, I leave that to the IGT test cases.
Signed-off-by: Christian König <christian.koenig@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 -
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 84 +++++++++++-----------
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 16 ++++-
4 files changed, 60 insertions(+), 44 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 39894e38fee4..17341e384caf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1191,7 +1191,6 @@ struct amdgpu_device {
bool apu_prefer_gtt;
bool userq_halt_for_enforce_isolation;
- struct work_struct userq_reset_work;
struct amdgpu_uid *uid_info;
struct amdgpu_uma_carveout_info uma_info;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b11c4b5fa8fc..cf61be17e061 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3786,7 +3786,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
}
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
- INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
amdgpu_coredump_init(adev);
@@ -5477,7 +5476,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
if (!amdgpu_sriov_vf(adev))
cancel_work(&adev->reset_work);
#endif
- cancel_work(&adev->userq_reset_work);
+ amdgpu_userq_mgr_cancel_reset_work(adev);
if (adev->kfd.dev)
cancel_work(&adev->kfd.reset_work);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 0a4c39d83adc..ad6dac17dd21 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -82,19 +82,11 @@ static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
return false;
}
-static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
-{
- if (amdgpu_device_should_recover_gpu(adev)) {
- amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->userq_reset_work);
- /* Wait for the reset job to complete */
- flush_work(&adev->userq_reset_work);
- }
-}
-
-static int
-amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
{
+ struct amdgpu_userq_mgr *uq_mgr =
+ container_of(work, struct amdgpu_userq_mgr,
+ reset_work);
struct amdgpu_device *adev = uq_mgr->adev;
const int queue_types[] = {
AMDGPU_RING_TYPE_COMPUTE,
@@ -103,12 +95,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
};
const int num_queue_types = ARRAY_SIZE(queue_types);
bool gpu_reset = false;
- int r = 0;
- int i;
+ int i, r;
if (unlikely(adev->debug_disable_gpu_ring_reset)) {
dev_err(adev->dev, "userq reset disabled by debug mask\n");
- return 0;
+ return;
}
/*
@@ -116,7 +107,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
* skip all reset detection logic
*/
if (!amdgpu_gpu_recovery)
- return 0;
+ return;
/*
* Iterate through all queue types to detect and reset problematic queues
@@ -141,10 +132,19 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
}
}
- if (gpu_reset)
- amdgpu_userq_gpu_reset(adev);
+ if (gpu_reset) {
+ struct amdgpu_reset_context reset_context;
- return r;
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ reset_context.src = AMDGPU_RESET_SRC_USERQ;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+ amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+ }
}
static void amdgpu_userq_hang_detect_work(struct work_struct *work)
The function and the work handler are using the same name, which causes
confusion when reading the code.

Regards,
Sunil Khatri
@@ -153,7 +153,11 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
container_of(work, struct amdgpu_usermode_queue,
hang_detect_work.work);
- amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);
+ /*
+ * Don't schedule the work here! Scheduling or queuing work from one reset
+ * handler to another is illegal if you don't take extra precautions!
+ */
+ amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
}
/*
@@ -182,8 +186,8 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
break;
}
- schedule_delayed_work(&queue->hang_detect_work,
- msecs_to_jiffies(timeout_ms));
+ queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
+ msecs_to_jiffies(timeout_ms));
}
void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell)
@@ -1256,28 +1260,13 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
if (ret) {
drm_file_err(uq_mgr->file,
"Couldn't unmap all the queues, eviction failed ret=%d\n", ret);
- amdgpu_userq_detect_and_reset_queues(uq_mgr);
+ amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain,
+ &uq_mgr->reset_work);
+ flush_work(&uq_mgr->reset_work);
flush_work() is called here with the userq_mutex held? Is it ok to block
for that long? I'm not sure, but flush_work() might also try to take the
userq_mutex again — that was the problem initially during reset.
}
return ret;
}
-void amdgpu_userq_reset_work(struct work_struct *work)
-{
- struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
- userq_reset_work);
- struct amdgpu_reset_context reset_context;
-
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- reset_context.src = AMDGPU_RESET_SRC_USERQ;
- set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
-}
-
static void
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
@@ -1311,9 +1300,24 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
userq_mgr->file = file_priv;
INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
+ INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
return 0;
}
+void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev)
+{
+ struct xarray *xa = &adev->userq_doorbell_xa;
+ struct amdgpu_usermode_queue *queue;
+ unsigned long flags, queue_id;
+
+ xa_lock_irqsave(xa, flags);
+ xa_for_each(xa, queue_id, queue) {
+ cancel_delayed_work(&queue->hang_detect_work);
+ cancel_work(&queue->userq_mgr->reset_work);
+ }
+ xa_unlock_irqrestore(xa, flags);
+}
+
void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr)
{
cancel_delayed_work_sync(&userq_mgr->resume_work);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 85f460e7c31b..49b33e2d6932 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -84,7 +84,13 @@ struct amdgpu_usermode_queue {
u32 xcp_id;
int priority;
struct dentry *debugfs_queue;
- struct delayed_work hang_detect_work;
+
+ /**
+ * @hang_detect_work:
+ *
+ * Delayed work which runs when userq_fences time out.
+ */
+ struct delayed_work hang_detect_work;
struct kref refcount;
struct list_head userq_va_list;
@@ -116,6 +122,13 @@ struct amdgpu_userq_mgr {
struct amdgpu_device *adev;
struct delayed_work resume_work;
struct drm_file *file;
+
+ /**
+ * @reset_work:
+ *
+ * Reset work which is used when eviction fails.
+ */
+ struct work_struct reset_work;
atomic_t userq_count[AMDGPU_RING_TYPE_MAX];
};
@@ -134,6 +147,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp
int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
struct amdgpu_device *adev);
+void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev);
void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr);
void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);