From: Zhanjun Dong <zhanjun.dong@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: Zhanjun Dong <zhanjun.dong@intel.com>,
stable@vger.kernel.org, Matthew Brost <matthew.brost@intel.com>
Subject: [PATCH v6 2/6] drm/xe: Forcefully tear down exec queues in GuC submit fini
Date: Wed, 11 Feb 2026 17:20:16 -0500 [thread overview]
Message-ID: <20260211222020.848341-3-zhanjun.dong@intel.com> (raw)
In-Reply-To: <20260211222020.848341-1-zhanjun.dong@intel.com>
In GuC submit fini, forcefully tear down any exec queues by disabling
CTs, stopping the scheduler (which cleans up lost G2H), killing all
remaining queues, and resuming scheduling to allow any remaining cleanup
actions to complete and signal any remaining fences.
Split guc_submit_fini into device related and software only part. Using
device-managed and drm-managed action guarantees the correct ordering of
cleanup.
v4:
- Split guc_submit_fini into 2 parts, device related by devm and software
only part by drmm
v3:
- Add page fault fix
v2:
- Fix VF failure (CI)
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Cc: stable@vger.kernel.org
Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_guc.c | 18 +++++++++--
drivers/gpu/drm/xe/xe_guc.h | 1 +
drivers/gpu/drm/xe/xe_guc_submit.c | 48 +++++++++++++++++++++++-------
3 files changed, 55 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index cbbb4d665b8f..f59ae602b8ed 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -1402,15 +1402,29 @@ int xe_guc_enable_communication(struct xe_guc *guc)
return 0;
}
-int xe_guc_suspend(struct xe_guc *guc)
+int xe_guc_softreset(struct xe_guc *guc)
{
- struct xe_gt *gt = guc_to_gt(guc);
u32 action[] = {
XE_GUC_ACTION_CLIENT_SOFT_RESET,
};
int ret;
+ if (!xe_uc_fw_is_running(&guc->fw))
+ return 0;
+
ret = xe_guc_mmio_send(guc, action, ARRAY_SIZE(action));
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+int xe_guc_suspend(struct xe_guc *guc)
+{
+ struct xe_gt *gt = guc_to_gt(guc);
+ int ret;
+
+ ret = xe_guc_softreset(guc);
if (ret) {
xe_gt_err(gt, "GuC suspend failed: %pe\n", ERR_PTR(ret));
return ret;
diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
index 66e7edc70ed9..02514914f404 100644
--- a/drivers/gpu/drm/xe/xe_guc.h
+++ b/drivers/gpu/drm/xe/xe_guc.h
@@ -44,6 +44,7 @@ int xe_guc_opt_in_features_enable(struct xe_guc *guc);
void xe_guc_runtime_suspend(struct xe_guc *guc);
void xe_guc_runtime_resume(struct xe_guc *guc);
int xe_guc_suspend(struct xe_guc *guc);
+int xe_guc_softreset(struct xe_guc *guc);
void xe_guc_notify(struct xe_guc *guc);
int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr);
int xe_guc_mmio_send(struct xe_guc *guc, const u32 *request, u32 len);
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 42712acf2ec2..bfb176c201c7 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -47,6 +47,8 @@
#define XE_GUC_EXEC_QUEUE_CGP_CONTEXT_ERROR_LEN 6
+static int __xe_guc_submit_reset_prepare(struct xe_guc *guc);
+
static struct xe_guc *
exec_queue_to_guc(struct xe_exec_queue *q)
{
@@ -238,7 +240,7 @@ static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
EXEC_QUEUE_STATE_BANNED));
}
-static void guc_submit_fini(struct drm_device *drm, void *arg)
+static void guc_submit_sw_fini(struct drm_device *drm, void *arg)
{
struct xe_guc *guc = arg;
struct xe_device *xe = guc_to_xe(guc);
@@ -256,6 +258,19 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
xa_destroy(&guc->submission_state.exec_queue_lookup);
}
+static void guc_submit_fini(void *arg)
+{
+ struct xe_guc *guc = arg;
+
+ /* Forcefully kill any remaining exec queues */
+ xe_guc_ct_stop(&guc->ct);
+ __xe_guc_submit_reset_prepare(guc);
+ xe_guc_softreset(guc);
+ xe_guc_submit_stop(guc);
+ xe_uc_fw_sanitize(&guc->fw);
+ xe_guc_submit_pause_abort(guc);
+}
+
static void guc_submit_wedged_fini(void *arg)
{
struct xe_guc *guc = arg;
@@ -325,7 +340,11 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids)
guc->submission_state.initialized = true;
- return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc);
+ err = drmm_add_action_or_reset(&xe->drm, guc_submit_sw_fini, guc);
+ if (err)
+ return err;
+
+ return devm_add_action_or_reset(xe->drm.dev, guc_submit_fini, guc);
}
/*
@@ -2298,6 +2317,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
{
struct xe_gpu_scheduler *sched = &q->guc->sched;
+ bool do_destroy = false;
/* Stop scheduling + flush any DRM scheduler operations */
xe_sched_submission_stop(sched);
@@ -2305,7 +2325,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
/* Clean up lost G2H + reset engine state */
if (exec_queue_registered(q)) {
if (exec_queue_destroyed(q))
- __guc_exec_queue_destroy(guc, q);
+ do_destroy = true;
}
if (q->guc->suspend_pending) {
set_exec_queue_suspended(q);
@@ -2341,18 +2361,15 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
xe_guc_exec_queue_trigger_cleanup(q);
}
}
+
+ if (do_destroy)
+ __guc_exec_queue_destroy(guc, q);
}
-int xe_guc_submit_reset_prepare(struct xe_guc *guc)
+static int __xe_guc_submit_reset_prepare(struct xe_guc *guc)
{
int ret;
- if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
- return 0;
-
- if (!guc->submission_state.initialized)
- return 0;
-
/*
* Using an atomic here rather than submission_state.lock as this
* function can be called while holding the CT lock (engine reset
@@ -2367,6 +2384,17 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
return ret;
}
+int xe_guc_submit_reset_prepare(struct xe_guc *guc)
+{
+ if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
+ return 0;
+
+ if (!guc->submission_state.initialized)
+ return 0;
+
+ return __xe_guc_submit_reset_prepare(guc);
+}
+
void xe_guc_submit_reset_wait(struct xe_guc *guc)
{
wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||
--
2.34.1
next prev parent reply other threads:[~2026-02-11 22:20 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-11 22:20 [PATCH v6 0/6] Attempt to fixup reset, wedge, unload corner cases Zhanjun Dong
2026-02-11 22:20 ` [PATCH v6 1/6] drm/xe: Always kill exec queues in xe_guc_submit_pause_abort Zhanjun Dong
2026-02-11 22:20 ` Zhanjun Dong [this message]
2026-02-11 22:20 ` [PATCH v6 3/6] drm/xe: Trigger queue cleanup if not in wedged mode 2 Zhanjun Dong
2026-02-11 22:20 ` [PATCH v6 4/6] drm/xe: Use XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET enum instead of magic number Zhanjun Dong
2026-02-11 22:20 ` [PATCH v6 5/6] drm/xe/guc: Ensure CT state transitions via STOP before DISABLED Zhanjun Dong
2026-02-11 22:20 ` [PATCH v6 6/6] drm/xe/uc: Drop xe_guc_sanitize in favor of managed cleanup Zhanjun Dong
2026-02-11 22:27 ` ✓ CI.KUnit: success for Attempt to fixup reset, wedge, unload corner cases Patchwork
2026-02-11 23:02 ` ✓ Xe.CI.BAT: " Patchwork
2026-02-13 5:44 ` ✗ Xe.CI.FULL: failure " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260211222020.848341-3-zhanjun.dong@intel.com \
--to=zhanjun.dong@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=matthew.brost@intel.com \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox