public inbox for stable@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v9 1/7] drm/xe: Always kill exec queues in xe_guc_submit_pause_abort
       [not found] <20260310225039.1320161-1-zhanjun.dong@intel.com>
@ 2026-03-10 22:50 ` Zhanjun Dong
  2026-03-10 22:50 ` [PATCH v9 2/7] drm/xe: Forcefully tear down exec queues in GuC submit fini Zhanjun Dong
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Zhanjun Dong @ 2026-03-10 22:50 UTC (permalink / raw)
  To: intel-xe; +Cc: Matthew Brost, stable, Zhanjun Dong, Stuart Summers

From: Matthew Brost <matthew.brost@intel.com>

xe_guc_submit_pause_abort is intended to be called after something
disastrous occurs (e.g., VF migration fails, device wedging, or driver
unload) and should immediately trigger the teardown of remaining
submission state. With that, kill any remaining queues in this function.

Fixes: 7c4b7e34c83b ("drm/xe/vf: Abort VF post migration recovery on failure")
Cc: stable@vger.kernel.org
Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
Reviewed-by: Stuart Summers <stuart.summers@intel.com>
---
 drivers/gpu/drm/xe/xe_guc_submit.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index ca7aa4f358d0..b31e0e0af5cb 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -2763,8 +2763,7 @@ void xe_guc_submit_pause_abort(struct xe_guc *guc)
 			continue;
 
 		xe_sched_submission_start(sched);
-		if (exec_queue_killed_or_banned_or_wedged(q))
-			xe_guc_exec_queue_trigger_cleanup(q);
+		guc_exec_queue_kill(q);
 	}
 	mutex_unlock(&guc->submission_state.lock);
 }
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v9 2/7] drm/xe: Forcefully tear down exec queues in GuC submit fini
       [not found] <20260310225039.1320161-1-zhanjun.dong@intel.com>
  2026-03-10 22:50 ` [PATCH v9 1/7] drm/xe: Always kill exec queues in xe_guc_submit_pause_abort Zhanjun Dong
@ 2026-03-10 22:50 ` Zhanjun Dong
  2026-03-11 16:34   ` Dong, Zhanjun
  2026-03-10 22:50 ` [PATCH v9 3/7] drm/xe: Trigger queue cleanup if not in wedged mode 2 Zhanjun Dong
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Zhanjun Dong @ 2026-03-10 22:50 UTC (permalink / raw)
  To: intel-xe; +Cc: Zhanjun Dong, stable, Matthew Brost

In GuC submit fini, forcefully tear down any exec queues by disabling
CTs, stopping the scheduler (which cleans up lost G2H), killing all
remaining queues, and resuming scheduling to allow any remaining cleanup
actions to complete and signal any remaining fences.

Split guc_submit_fini into a device-related part and a software-only part.
Using device-managed and drm-managed actions guarantees the correct
ordering of cleanup.

Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Cc: stable@vger.kernel.org
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_guc.c        | 26 ++++++++++++++--
 drivers/gpu/drm/xe/xe_guc.h        |  1 +
 drivers/gpu/drm/xe/xe_guc_submit.c | 48 +++++++++++++++++++++++-------
 3 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index e75653a5e797..f6964b8f8ede 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -1399,15 +1399,37 @@ int xe_guc_enable_communication(struct xe_guc *guc)
 	return 0;
 }
 
-int xe_guc_suspend(struct xe_guc *guc)
+/**
+ * xe_guc_softreset() - Soft reset GuC
+ * @guc: The GuC object
+ *
+ * Send soft reset command to GuC through mmio send.
+ *
+ * Return: 0 if success, otherwise error code
+ */
+int xe_guc_softreset(struct xe_guc *guc)
 {
-	struct xe_gt *gt = guc_to_gt(guc);
 	u32 action[] = {
 		XE_GUC_ACTION_CLIENT_SOFT_RESET,
 	};
 	int ret;
 
+	if (!xe_uc_fw_is_running(&guc->fw))
+		return 0;
+
 	ret = xe_guc_mmio_send(guc, action, ARRAY_SIZE(action));
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+int xe_guc_suspend(struct xe_guc *guc)
+{
+	struct xe_gt *gt = guc_to_gt(guc);
+	int ret;
+
+	ret = xe_guc_softreset(guc);
 	if (ret) {
 		xe_gt_err(gt, "GuC suspend failed: %pe\n", ERR_PTR(ret));
 		return ret;
diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
index 66e7edc70ed9..02514914f404 100644
--- a/drivers/gpu/drm/xe/xe_guc.h
+++ b/drivers/gpu/drm/xe/xe_guc.h
@@ -44,6 +44,7 @@ int xe_guc_opt_in_features_enable(struct xe_guc *guc);
 void xe_guc_runtime_suspend(struct xe_guc *guc);
 void xe_guc_runtime_resume(struct xe_guc *guc);
 int xe_guc_suspend(struct xe_guc *guc);
+int xe_guc_softreset(struct xe_guc *guc);
 void xe_guc_notify(struct xe_guc *guc);
 int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr);
 int xe_guc_mmio_send(struct xe_guc *guc, const u32 *request, u32 len);
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index b31e0e0af5cb..8afd424b27fb 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -47,6 +47,8 @@
 
 #define XE_GUC_EXEC_QUEUE_CGP_CONTEXT_ERROR_LEN		6
 
+static int guc_submit_reset_prepare(struct xe_guc *guc);
+
 static struct xe_guc *
 exec_queue_to_guc(struct xe_exec_queue *q)
 {
@@ -238,7 +240,7 @@ static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
 		 EXEC_QUEUE_STATE_BANNED));
 }
 
-static void guc_submit_fini(struct drm_device *drm, void *arg)
+static void guc_submit_sw_fini(struct drm_device *drm, void *arg)
 {
 	struct xe_guc *guc = arg;
 	struct xe_device *xe = guc_to_xe(guc);
@@ -256,6 +258,19 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
 	xa_destroy(&guc->submission_state.exec_queue_lookup);
 }
 
+static void guc_submit_fini(void *arg)
+{
+	struct xe_guc *guc = arg;
+
+	/* Forcefully kill any remaining exec queues */
+	xe_guc_ct_stop(&guc->ct);
+	guc_submit_reset_prepare(guc);
+	xe_guc_softreset(guc);
+	xe_guc_submit_stop(guc);
+	xe_uc_fw_sanitize(&guc->fw);
+	xe_guc_submit_pause_abort(guc);
+}
+
 static void guc_submit_wedged_fini(void *arg)
 {
 	struct xe_guc *guc = arg;
@@ -325,7 +340,11 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids)
 
 	guc->submission_state.initialized = true;
 
-	return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc);
+	err = drmm_add_action_or_reset(&xe->drm, guc_submit_sw_fini, guc);
+	if (err)
+		return err;
+
+	return devm_add_action_or_reset(xe->drm.dev, guc_submit_fini, guc);
 }
 
 /*
@@ -2298,6 +2317,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
 static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
 {
 	struct xe_gpu_scheduler *sched = &q->guc->sched;
+	bool do_destroy = false;
 
 	/* Stop scheduling + flush any DRM scheduler operations */
 	xe_sched_submission_stop(sched);
@@ -2305,7 +2325,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
 	/* Clean up lost G2H + reset engine state */
 	if (exec_queue_registered(q)) {
 		if (exec_queue_destroyed(q))
-			__guc_exec_queue_destroy(guc, q);
+			do_destroy = true;
 	}
 	if (q->guc->suspend_pending) {
 		set_exec_queue_suspended(q);
@@ -2341,18 +2361,15 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
 			xe_guc_exec_queue_trigger_cleanup(q);
 		}
 	}
+
+	if (do_destroy)
+		__guc_exec_queue_destroy(guc, q);
 }
 
-int xe_guc_submit_reset_prepare(struct xe_guc *guc)
+static int guc_submit_reset_prepare(struct xe_guc *guc)
 {
 	int ret;
 
-	if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
-		return 0;
-
-	if (!guc->submission_state.initialized)
-		return 0;
-
 	/*
 	 * Using an atomic here rather than submission_state.lock as this
 	 * function can be called while holding the CT lock (engine reset
@@ -2367,6 +2384,17 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
 	return ret;
 }
 
+int xe_guc_submit_reset_prepare(struct xe_guc *guc)
+{
+	if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
+		return 0;
+
+	if (!guc->submission_state.initialized)
+		return 0;
+
+	return guc_submit_reset_prepare(guc);
+}
+
 void xe_guc_submit_reset_wait(struct xe_guc *guc)
 {
 	wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v9 3/7] drm/xe: Trigger queue cleanup if not in wedged mode 2
       [not found] <20260310225039.1320161-1-zhanjun.dong@intel.com>
  2026-03-10 22:50 ` [PATCH v9 1/7] drm/xe: Always kill exec queues in xe_guc_submit_pause_abort Zhanjun Dong
  2026-03-10 22:50 ` [PATCH v9 2/7] drm/xe: Forcefully tear down exec queues in GuC submit fini Zhanjun Dong
@ 2026-03-10 22:50 ` Zhanjun Dong
  2026-03-11 16:33   ` Dong, Zhanjun
  2026-03-10 22:50 ` [PATCH v9 5/7] drm/xe/guc: Ensure CT state transitions via STOP before DISABLED Zhanjun Dong
  2026-03-10 22:50 ` [PATCH v9 7/7] drm/xe: Open-code GGTT MMIO access protection Zhanjun Dong
  4 siblings, 1 reply; 11+ messages in thread
From: Zhanjun Dong @ 2026-03-10 22:50 UTC (permalink / raw)
  To: intel-xe; +Cc: Zhanjun Dong, stable, Matthew Brost

The intent of wedging a device is to allow queues to continue running
only in wedged mode 2. In other modes, queues should initiate cleanup
and signal all remaining fences. Fix xe_guc_submit_wedge to correctly
clean up queues when wedge mode != 2.

Fixes: 7dbe8af13c18 ("drm/xe: Wedge the entire device")
Cc: stable@vger.kernel.org
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_guc_submit.c | 35 +++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 8afd424b27fb..cb32053d57ec 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1319,6 +1319,7 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
  */
 void xe_guc_submit_wedge(struct xe_guc *guc)
 {
+	struct xe_device *xe = guc_to_xe(guc);
 	struct xe_gt *gt = guc_to_gt(guc);
 	struct xe_exec_queue *q;
 	unsigned long index;
@@ -1333,20 +1334,28 @@ void xe_guc_submit_wedge(struct xe_guc *guc)
 	if (!guc->submission_state.initialized)
 		return;
 
-	err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
-				       guc_submit_wedged_fini, guc);
-	if (err) {
-		xe_gt_err(gt, "Failed to register clean-up in wedged.mode=%s; "
-			  "Although device is wedged.\n",
-			  xe_wedged_mode_to_string(XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET));
-		return;
-	}
+	if (xe->wedged.mode == 2) {
+		err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
+					       guc_submit_wedged_fini, guc);
+		if (err) {
+			xe_gt_err(gt, "Failed to register clean-up on wedged.mode=2; "
+				  "Although device is wedged.\n");
+			return;
+		}
 
-	mutex_lock(&guc->submission_state.lock);
-	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
-		if (xe_exec_queue_get_unless_zero(q))
-			set_exec_queue_wedged(q);
-	mutex_unlock(&guc->submission_state.lock);
+		mutex_lock(&guc->submission_state.lock);
+		xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+			if (xe_exec_queue_get_unless_zero(q))
+				set_exec_queue_wedged(q);
+		mutex_unlock(&guc->submission_state.lock);
+	} else {
+		/* Forcefully kill any remaining exec queues, signal fences */
+		guc_submit_reset_prepare(guc);
+		xe_guc_submit_stop(guc);
+		xe_guc_softreset(guc);
+		xe_uc_fw_sanitize(&guc->fw);
+		xe_guc_submit_pause_abort(guc);
+	}
 }
 
 static bool guc_submit_hint_wedged(struct xe_guc *guc)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v9 5/7] drm/xe/guc: Ensure CT state transitions via STOP before DISABLED
       [not found] <20260310225039.1320161-1-zhanjun.dong@intel.com>
                   ` (2 preceding siblings ...)
  2026-03-10 22:50 ` [PATCH v9 3/7] drm/xe: Trigger queue cleanup if not in wedged mode 2 Zhanjun Dong
@ 2026-03-10 22:50 ` Zhanjun Dong
  2026-03-10 22:50 ` [PATCH v9 7/7] drm/xe: Open-code GGTT MMIO access protection Zhanjun Dong
  4 siblings, 0 replies; 11+ messages in thread
From: Zhanjun Dong @ 2026-03-10 22:50 UTC (permalink / raw)
  To: intel-xe; +Cc: Zhanjun Dong, stable, Matthew Brost

The GuC CT state transition requires moving to the STOP state before
entering the DISABLED state. Update the driver teardown sequence to make
the proper state machine transitions.

Fixes: ee4b32220a6b ("drm/xe/guc: Add devm release action to safely tear down CT")
Cc: stable@vger.kernel.org
Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_guc_ct.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 496c6c77bee6..3b1c03743f83 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -352,6 +352,7 @@ static void guc_action_disable_ct(void *arg)
 {
 	struct xe_guc_ct *ct = arg;
 
+	xe_guc_ct_stop(ct);
 	guc_ct_change_state(ct, XE_GUC_CT_STATE_DISABLED);
 }
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v9 7/7] drm/xe: Open-code GGTT MMIO access protection
       [not found] <20260310225039.1320161-1-zhanjun.dong@intel.com>
                   ` (3 preceding siblings ...)
  2026-03-10 22:50 ` [PATCH v9 5/7] drm/xe/guc: Ensure CT state transitions via STOP before DISABLED Zhanjun Dong
@ 2026-03-10 22:50 ` Zhanjun Dong
  4 siblings, 0 replies; 11+ messages in thread
From: Zhanjun Dong @ 2026-03-10 22:50 UTC (permalink / raw)
  To: intel-xe; +Cc: Matthew Brost, stable, Zhanjun Dong

From: Matthew Brost <matthew.brost@intel.com>

GGTT MMIO access is currently protected by hotplug (drm_dev_enter),
which works correctly when the driver loads successfully and is later
unbound or unloaded. However, if driver load fails, this protection is
insufficient because drm_dev_unplug() is never called.

Additionally, devm release functions cannot guarantee that all BOs with
GGTT mappings are destroyed before the GGTT MMIO region is removed, as
some BOs may be freed asynchronously by worker threads.

To address this, introduce an open-coded flag, protected by the GGTT
lock, that guards GGTT MMIO access. The flag is cleared during the
dev_fini_ggtt devm release function to ensure MMIO access is disabled
once teardown begins.

Cc: stable@vger.kernel.org
Fixes: 919bb54e989c ("drm/xe: Fix missing runtime outer protection for ggtt_remove_node")
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Zhanjun Dong <zhanjun.dong@intel.com>
---
 drivers/gpu/drm/xe/xe_ggtt.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
index 0f2e3af49912..21071b64b09d 100644
--- a/drivers/gpu/drm/xe/xe_ggtt.c
+++ b/drivers/gpu/drm/xe/xe_ggtt.c
@@ -66,6 +66,9 @@
  * give us the correct placement for free.
  */
 
+#define XE_GGTT_FLAGS_64K	BIT(0)
+#define XE_GGTT_FLAGS_ONLINE	BIT(1)
+
 /**
  * struct xe_ggtt_node - A node in GGTT.
  *
@@ -117,6 +120,8 @@ struct xe_ggtt {
 	 * @flags: Flags for this GGTT
 	 * Acceptable flags:
 	 * - %XE_GGTT_FLAGS_64K - if PTE size is 64K. Otherwise, regular is 4K.
+	 * - %XE_GGTT_FLAGS_ONLINE - is GGTT online, protected by ggtt->lock
+	 *   after init
 	 */
 	unsigned int flags;
 	/** @scratch: Internal object allocation used as a scratch page */
@@ -367,6 +372,8 @@ static void dev_fini_ggtt(void *arg)
 {
 	struct xe_ggtt *ggtt = arg;
 
+	scoped_guard(mutex, &ggtt->lock)
+		ggtt->flags &= ~XE_GGTT_FLAGS_ONLINE;
 	drain_workqueue(ggtt->wq);
 }
 
@@ -437,6 +444,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
 	if (err)
 		return err;
 
+	ggtt->flags |= XE_GGTT_FLAGS_ONLINE;
 	return devm_add_action_or_reset(xe->drm.dev, dev_fini_ggtt, ggtt);
 }
 ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO); /* See xe_pci_probe() */
@@ -465,13 +473,10 @@ static void ggtt_node_fini(struct xe_ggtt_node *node)
 static void ggtt_node_remove(struct xe_ggtt_node *node)
 {
 	struct xe_ggtt *ggtt = node->ggtt;
-	struct xe_device *xe = tile_to_xe(ggtt->tile);
 	bool bound;
-	int idx;
-
-	bound = drm_dev_enter(&xe->drm, &idx);
 
 	mutex_lock(&ggtt->lock);
+	bound = ggtt->flags & XE_GGTT_FLAGS_ONLINE;
 	if (bound)
 		xe_ggtt_clear(ggtt, xe_ggtt_node_addr(node), xe_ggtt_node_size(node));
 	drm_mm_remove_node(&node->base);
@@ -484,8 +489,6 @@ static void ggtt_node_remove(struct xe_ggtt_node *node)
 	if (node->invalidate_on_remove)
 		xe_ggtt_invalidate(ggtt);
 
-	drm_dev_exit(idx);
-
 free_node:
 	ggtt_node_fini(node);
 }
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v9 3/7] drm/xe: Trigger queue cleanup if not in wedged mode 2
  2026-03-10 22:50 ` [PATCH v9 3/7] drm/xe: Trigger queue cleanup if not in wedged mode 2 Zhanjun Dong
@ 2026-03-11 16:33   ` Dong, Zhanjun
  2026-03-13 18:49     ` Matthew Brost
  0 siblings, 1 reply; 11+ messages in thread
From: Dong, Zhanjun @ 2026-03-11 16:33 UTC (permalink / raw)
  To: intel-xe; +Cc: stable, Matthew Brost



On 2026-03-10 6:50 p.m., Zhanjun Dong wrote:
> The intent of wedging a device is to allow queues to continue running
> only in wedged mode 2. In other modes, queues should initiate cleanup
> and signal all remaining fences. Fix xe_guc_submit_wedge to correctly
> clean up queues when wedge mode != 2.
> 
> Fixes: 7dbe8af13c18 ("drm/xe: Wedge the entire device")
> Cc: stable@vger.kernel.org
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_guc_submit.c | 35 +++++++++++++++++++-----------
>   1 file changed, 22 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 8afd424b27fb..cb32053d57ec 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1319,6 +1319,7 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
>    */
>   void xe_guc_submit_wedge(struct xe_guc *guc)
>   {
> +	struct xe_device *xe = guc_to_xe(guc);
>   	struct xe_gt *gt = guc_to_gt(guc);
>   	struct xe_exec_queue *q;
>   	unsigned long index;
> @@ -1333,20 +1334,28 @@ void xe_guc_submit_wedge(struct xe_guc *guc)
>   	if (!guc->submission_state.initialized)
>   		return;
>   
> -	err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
> -				       guc_submit_wedged_fini, guc);
> -	if (err) {
> -		xe_gt_err(gt, "Failed to register clean-up in wedged.mode=%s; "
> -			  "Although device is wedged.\n",
> -			  xe_wedged_mode_to_string(XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET));
> -		return;
> -	}
> +	if (xe->wedged.mode == 2) {
> +		err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
> +					       guc_submit_wedged_fini, guc);
> +		if (err) {
> +			xe_gt_err(gt, "Failed to register clean-up on wedged.mode=2; "
> +				  "Although device is wedged.\n");
> +			return;
> +		}
>   
> -	mutex_lock(&guc->submission_state.lock);
> -	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
> -		if (xe_exec_queue_get_unless_zero(q))
> -			set_exec_queue_wedged(q);
> -	mutex_unlock(&guc->submission_state.lock);
> +		mutex_lock(&guc->submission_state.lock);
> +		xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
> +			if (xe_exec_queue_get_unless_zero(q))
> +				set_exec_queue_wedged(q);
> +		mutex_unlock(&guc->submission_state.lock);
> +	} else {
> +		/* Forcefully kill any remaining exec queues, signal fences */
Q: Shall we do VF bypass here?

Regards,
Zhanjun Dong
> +		guc_submit_reset_prepare(guc);
> +		xe_guc_submit_stop(guc);
> +		xe_guc_softreset(guc);
> +		xe_uc_fw_sanitize(&guc->fw);
> +		xe_guc_submit_pause_abort(guc);
> +	}
>   }
>   
>   static bool guc_submit_hint_wedged(struct xe_guc *guc)


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v9 2/7] drm/xe: Forcefully tear down exec queues in GuC submit fini
  2026-03-10 22:50 ` [PATCH v9 2/7] drm/xe: Forcefully tear down exec queues in GuC submit fini Zhanjun Dong
@ 2026-03-11 16:34   ` Dong, Zhanjun
  2026-03-13 18:48     ` Matthew Brost
  0 siblings, 1 reply; 11+ messages in thread
From: Dong, Zhanjun @ 2026-03-11 16:34 UTC (permalink / raw)
  To: intel-xe; +Cc: stable, Matthew Brost



On 2026-03-10 6:50 p.m., Zhanjun Dong wrote:
> In GuC submit fini, forcefully tear down any exec queues by disabling
> CTs, stopping the scheduler (which cleans up lost G2H), killing all
> remaining queues, and resuming scheduling to allow any remaining cleanup
> actions to complete and signal any remaining fences.
> 
> Split guc_submit_fini into device related and software only part. Using
> device-managed and drm-managed action guarantees the correct ordering of
> cleanup.
> 
> Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
> Cc: stable@vger.kernel.org
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_guc.c        | 26 ++++++++++++++--
>   drivers/gpu/drm/xe/xe_guc.h        |  1 +
>   drivers/gpu/drm/xe/xe_guc_submit.c | 48 +++++++++++++++++++++++-------
>   3 files changed, 63 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
> index e75653a5e797..f6964b8f8ede 100644
> --- a/drivers/gpu/drm/xe/xe_guc.c
> +++ b/drivers/gpu/drm/xe/xe_guc.c
> @@ -1399,15 +1399,37 @@ int xe_guc_enable_communication(struct xe_guc *guc)
>   	return 0;
>   }
>   
> -int xe_guc_suspend(struct xe_guc *guc)
> +/**
> + * xe_guc_softreset() - Soft reset GuC
> + * @guc: The GuC object
> + *
> + * Send soft reset command to GuC through mmio send.
> + *
> + * Return: 0 if success, otherwise error code
> + */
> +int xe_guc_softreset(struct xe_guc *guc)
>   {
> -	struct xe_gt *gt = guc_to_gt(guc);
>   	u32 action[] = {
>   		XE_GUC_ACTION_CLIENT_SOFT_RESET,
>   	};
>   	int ret;
>   
> +	if (!xe_uc_fw_is_running(&guc->fw))
> +		return 0;
> +
>   	ret = xe_guc_mmio_send(guc, action, ARRAY_SIZE(action));
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
> +int xe_guc_suspend(struct xe_guc *guc)
> +{
> +	struct xe_gt *gt = guc_to_gt(guc);
> +	int ret;
> +
> +	ret = xe_guc_softreset(guc);
>   	if (ret) {
>   		xe_gt_err(gt, "GuC suspend failed: %pe\n", ERR_PTR(ret));
>   		return ret;
> diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
> index 66e7edc70ed9..02514914f404 100644
> --- a/drivers/gpu/drm/xe/xe_guc.h
> +++ b/drivers/gpu/drm/xe/xe_guc.h
> @@ -44,6 +44,7 @@ int xe_guc_opt_in_features_enable(struct xe_guc *guc);
>   void xe_guc_runtime_suspend(struct xe_guc *guc);
>   void xe_guc_runtime_resume(struct xe_guc *guc);
>   int xe_guc_suspend(struct xe_guc *guc);
> +int xe_guc_softreset(struct xe_guc *guc);
>   void xe_guc_notify(struct xe_guc *guc);
>   int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr);
>   int xe_guc_mmio_send(struct xe_guc *guc, const u32 *request, u32 len);
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index b31e0e0af5cb..8afd424b27fb 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -47,6 +47,8 @@
>   
>   #define XE_GUC_EXEC_QUEUE_CGP_CONTEXT_ERROR_LEN		6
>   
> +static int guc_submit_reset_prepare(struct xe_guc *guc);
> +
>   static struct xe_guc *
>   exec_queue_to_guc(struct xe_exec_queue *q)
>   {
> @@ -238,7 +240,7 @@ static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
>   		 EXEC_QUEUE_STATE_BANNED));
>   }
>   
> -static void guc_submit_fini(struct drm_device *drm, void *arg)
> +static void guc_submit_sw_fini(struct drm_device *drm, void *arg)
>   {
>   	struct xe_guc *guc = arg;
>   	struct xe_device *xe = guc_to_xe(guc);
> @@ -256,6 +258,19 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
>   	xa_destroy(&guc->submission_state.exec_queue_lookup);
>   }
>   
> +static void guc_submit_fini(void *arg)
> +{
> +	struct xe_guc *guc = arg;
> +
> +	/* Forcefully kill any remaining exec queues */
Shall we do VF bypass here?

Regards,
Zhanjun Dong
> +	xe_guc_ct_stop(&guc->ct);
> +	guc_submit_reset_prepare(guc);
> +	xe_guc_softreset(guc);
> +	xe_guc_submit_stop(guc);
> +	xe_uc_fw_sanitize(&guc->fw);
> +	xe_guc_submit_pause_abort(guc);
> +}
> +
>   static void guc_submit_wedged_fini(void *arg)
>   {
>   	struct xe_guc *guc = arg;
> @@ -325,7 +340,11 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids)
>   
>   	guc->submission_state.initialized = true;
>   
> -	return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc);
> +	err = drmm_add_action_or_reset(&xe->drm, guc_submit_sw_fini, guc);
> +	if (err)
> +		return err;
> +
> +	return devm_add_action_or_reset(xe->drm.dev, guc_submit_fini, guc);
>   }
>   
>   /*
> @@ -2298,6 +2317,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
>   static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
>   {
>   	struct xe_gpu_scheduler *sched = &q->guc->sched;
> +	bool do_destroy = false;
>   
>   	/* Stop scheduling + flush any DRM scheduler operations */
>   	xe_sched_submission_stop(sched);
> @@ -2305,7 +2325,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
>   	/* Clean up lost G2H + reset engine state */
>   	if (exec_queue_registered(q)) {
>   		if (exec_queue_destroyed(q))
> -			__guc_exec_queue_destroy(guc, q);
> +			do_destroy = true;
>   	}
>   	if (q->guc->suspend_pending) {
>   		set_exec_queue_suspended(q);
> @@ -2341,18 +2361,15 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
>   			xe_guc_exec_queue_trigger_cleanup(q);
>   		}
>   	}
> +
> +	if (do_destroy)
> +		__guc_exec_queue_destroy(guc, q);
>   }
>   
> -int xe_guc_submit_reset_prepare(struct xe_guc *guc)
> +static int guc_submit_reset_prepare(struct xe_guc *guc)
>   {
>   	int ret;
>   
> -	if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
> -		return 0;
> -
> -	if (!guc->submission_state.initialized)
> -		return 0;
> -
>   	/*
>   	 * Using an atomic here rather than submission_state.lock as this
>   	 * function can be called while holding the CT lock (engine reset
> @@ -2367,6 +2384,17 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
>   	return ret;
>   }
>   
> +int xe_guc_submit_reset_prepare(struct xe_guc *guc)
> +{
> +	if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
> +		return 0;
> +
> +	if (!guc->submission_state.initialized)
> +		return 0;
> +
> +	return guc_submit_reset_prepare(guc);
> +}
> +
>   void xe_guc_submit_reset_wait(struct xe_guc *guc)
>   {
>   	wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v9 2/7] drm/xe: Forcefully tear down exec queues in GuC submit fini
  2026-03-11 16:34   ` Dong, Zhanjun
@ 2026-03-13 18:48     ` Matthew Brost
  2026-03-13 20:30       ` Dong, Zhanjun
  0 siblings, 1 reply; 11+ messages in thread
From: Matthew Brost @ 2026-03-13 18:48 UTC (permalink / raw)
  To: Dong, Zhanjun; +Cc: intel-xe, stable

On Wed, Mar 11, 2026 at 12:34:30PM -0400, Dong, Zhanjun wrote:
> 
> 
> On 2026-03-10 6:50 p.m., Zhanjun Dong wrote:
> > In GuC submit fini, forcefully tear down any exec queues by disabling
> > CTs, stopping the scheduler (which cleans up lost G2H), killing all
> > remaining queues, and resuming scheduling to allow any remaining cleanup
> > actions to complete and signal any remaining fences.
> > 
> > Split guc_submit_fini into device related and software only part. Using
> > device-managed and drm-managed action guarantees the correct ordering of
> > cleanup.
> > 
> > Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/xe/xe_guc.c        | 26 ++++++++++++++--
> >   drivers/gpu/drm/xe/xe_guc.h        |  1 +
> >   drivers/gpu/drm/xe/xe_guc_submit.c | 48 +++++++++++++++++++++++-------
> >   3 files changed, 63 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
> > index e75653a5e797..f6964b8f8ede 100644
> > --- a/drivers/gpu/drm/xe/xe_guc.c
> > +++ b/drivers/gpu/drm/xe/xe_guc.c
> > @@ -1399,15 +1399,37 @@ int xe_guc_enable_communication(struct xe_guc *guc)
> >   	return 0;
> >   }
> > -int xe_guc_suspend(struct xe_guc *guc)
> > +/**
> > + * xe_guc_softreset() - Soft reset GuC
> > + * @guc: The GuC object
> > + *
> > + * Send soft reset command to GuC through mmio send.
> > + *
> > + * Return: 0 if success, otherwise error code
> > + */
> > +int xe_guc_softreset(struct xe_guc *guc)
> >   {
> > -	struct xe_gt *gt = guc_to_gt(guc);
> >   	u32 action[] = {
> >   		XE_GUC_ACTION_CLIENT_SOFT_RESET,
> >   	};
> >   	int ret;
> > +	if (!xe_uc_fw_is_running(&guc->fw))
> > +		return 0;
> > +
> >   	ret = xe_guc_mmio_send(guc, action, ARRAY_SIZE(action));
> > +	if (ret)
> > +		return ret;
> > +
> > +	return 0;
> > +}
> > +
> > +int xe_guc_suspend(struct xe_guc *guc)
> > +{
> > +	struct xe_gt *gt = guc_to_gt(guc);
> > +	int ret;
> > +
> > +	ret = xe_guc_softreset(guc);
> >   	if (ret) {
> >   		xe_gt_err(gt, "GuC suspend failed: %pe\n", ERR_PTR(ret));
> >   		return ret;
> > diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
> > index 66e7edc70ed9..02514914f404 100644
> > --- a/drivers/gpu/drm/xe/xe_guc.h
> > +++ b/drivers/gpu/drm/xe/xe_guc.h
> > @@ -44,6 +44,7 @@ int xe_guc_opt_in_features_enable(struct xe_guc *guc);
> >   void xe_guc_runtime_suspend(struct xe_guc *guc);
> >   void xe_guc_runtime_resume(struct xe_guc *guc);
> >   int xe_guc_suspend(struct xe_guc *guc);
> > +int xe_guc_softreset(struct xe_guc *guc);
> >   void xe_guc_notify(struct xe_guc *guc);
> >   int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr);
> >   int xe_guc_mmio_send(struct xe_guc *guc, const u32 *request, u32 len);
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index b31e0e0af5cb..8afd424b27fb 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -47,6 +47,8 @@
> >   #define XE_GUC_EXEC_QUEUE_CGP_CONTEXT_ERROR_LEN		6
> > +static int guc_submit_reset_prepare(struct xe_guc *guc);
> > +
> >   static struct xe_guc *
> >   exec_queue_to_guc(struct xe_exec_queue *q)
> >   {
> > @@ -238,7 +240,7 @@ static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
> >   		 EXEC_QUEUE_STATE_BANNED));
> >   }
> > -static void guc_submit_fini(struct drm_device *drm, void *arg)
> > +static void guc_submit_sw_fini(struct drm_device *drm, void *arg)
> >   {
> >   	struct xe_guc *guc = arg;
> >   	struct xe_device *xe = guc_to_xe(guc);
> > @@ -256,6 +258,19 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
> >   	xa_destroy(&guc->submission_state.exec_queue_lookup);
> >   }
> > +static void guc_submit_fini(void *arg)
> > +{
> > +	struct xe_guc *guc = arg;
> > +
> > +	/* Forcefully kill any remaining exec queues */
> Shall we do VF bypass here?
> 

Why? These flows work on VFs and are still required. This is initiating
a software cut off communication with the GuC and cleaning up any lost
communications with the GuC so all queues are destroyed.

Matt

> Regards,
> Zhanjun Dong
> > +	xe_guc_ct_stop(&guc->ct);
> > +	guc_submit_reset_prepare(guc);
> > +	xe_guc_softreset(guc);
> > +	xe_guc_submit_stop(guc);
> > +	xe_uc_fw_sanitize(&guc->fw);
> > +	xe_guc_submit_pause_abort(guc);
> > +}
> > +
> >   static void guc_submit_wedged_fini(void *arg)
> >   {
> >   	struct xe_guc *guc = arg;
> > @@ -325,7 +340,11 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids)
> >   	guc->submission_state.initialized = true;
> > -	return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc);
> > +	err = drmm_add_action_or_reset(&xe->drm, guc_submit_sw_fini, guc);
> > +	if (err)
> > +		return err;
> > +
> > +	return devm_add_action_or_reset(xe->drm.dev, guc_submit_fini, guc);
> >   }
> >   /*
> > @@ -2298,6 +2317,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
> >   static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
> >   {
> >   	struct xe_gpu_scheduler *sched = &q->guc->sched;
> > +	bool do_destroy = false;
> >   	/* Stop scheduling + flush any DRM scheduler operations */
> >   	xe_sched_submission_stop(sched);
> > @@ -2305,7 +2325,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
> >   	/* Clean up lost G2H + reset engine state */
> >   	if (exec_queue_registered(q)) {
> >   		if (exec_queue_destroyed(q))
> > -			__guc_exec_queue_destroy(guc, q);
> > +			do_destroy = true;
> >   	}
> >   	if (q->guc->suspend_pending) {
> >   		set_exec_queue_suspended(q);
> > @@ -2341,18 +2361,15 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
> >   			xe_guc_exec_queue_trigger_cleanup(q);
> >   		}
> >   	}
> > +
> > +	if (do_destroy)
> > +		__guc_exec_queue_destroy(guc, q);
> >   }
> > -int xe_guc_submit_reset_prepare(struct xe_guc *guc)
> > +static int guc_submit_reset_prepare(struct xe_guc *guc)
> >   {
> >   	int ret;
> > -	if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
> > -		return 0;
> > -
> > -	if (!guc->submission_state.initialized)
> > -		return 0;
> > -
> >   	/*
> >   	 * Using an atomic here rather than submission_state.lock as this
> >   	 * function can be called while holding the CT lock (engine reset
> > @@ -2367,6 +2384,17 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
> >   	return ret;
> >   }
> > +int xe_guc_submit_reset_prepare(struct xe_guc *guc)
> > +{
> > +	if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
> > +		return 0;
> > +
> > +	if (!guc->submission_state.initialized)
> > +		return 0;
> > +
> > +	return guc_submit_reset_prepare(guc);
> > +}
> > +
> >   void xe_guc_submit_reset_wait(struct xe_guc *guc)
> >   {
> >   	wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v9 3/7] drm/xe: Trigger queue cleanup if not in wedged mode 2
  2026-03-11 16:33   ` Dong, Zhanjun
@ 2026-03-13 18:49     ` Matthew Brost
  2026-03-13 20:31       ` Dong, Zhanjun
  0 siblings, 1 reply; 11+ messages in thread
From: Matthew Brost @ 2026-03-13 18:49 UTC (permalink / raw)
  To: Dong, Zhanjun; +Cc: intel-xe, stable

On Wed, Mar 11, 2026 at 12:33:54PM -0400, Dong, Zhanjun wrote:
> 
> 
> On 2026-03-10 6:50 p.m., Zhanjun Dong wrote:
> > The intent of wedging a device is to allow queues to continue running
> > only in wedged mode 2. In other modes, queues should initiate cleanup
> > and signal all remaining fences. Fix xe_guc_submit_wedge to correctly
> > clean up queues when wedge mode != 2.
> > 
> > Fixes: 7dbe8af13c18 ("drm/xe: Wedge the entire device")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/xe/xe_guc_submit.c | 35 +++++++++++++++++++-----------
> >   1 file changed, 22 insertions(+), 13 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 8afd424b27fb..cb32053d57ec 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -1319,6 +1319,7 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
> >    */
> >   void xe_guc_submit_wedge(struct xe_guc *guc)
> >   {
> > +	struct xe_device *xe = guc_to_xe(guc);
> >   	struct xe_gt *gt = guc_to_gt(guc);
> >   	struct xe_exec_queue *q;
> >   	unsigned long index;
> > @@ -1333,20 +1334,28 @@ void xe_guc_submit_wedge(struct xe_guc *guc)
> >   	if (!guc->submission_state.initialized)
> >   		return;
> > -	err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
> > -				       guc_submit_wedged_fini, guc);
> > -	if (err) {
> > -		xe_gt_err(gt, "Failed to register clean-up in wedged.mode=%s; "
> > -			  "Although device is wedged.\n",
> > -			  xe_wedged_mode_to_string(XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET));
> > -		return;
> > -	}
> > +	if (xe->wedged.mode == 2) {
> > +		err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
> > +					       guc_submit_wedged_fini, guc);
> > +		if (err) {
> > +			xe_gt_err(gt, "Failed to register clean-up on wedged.mode=2; "
> > +				  "Although device is wedged.\n");
> > +			return;
> > +		}
> > -	mutex_lock(&guc->submission_state.lock);
> > -	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
> > -		if (xe_exec_queue_get_unless_zero(q))
> > -			set_exec_queue_wedged(q);
> > -	mutex_unlock(&guc->submission_state.lock);
> > +		mutex_lock(&guc->submission_state.lock);
> > +		xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
> > +			if (xe_exec_queue_get_unless_zero(q))
> > +				set_exec_queue_wedged(q);
> > +		mutex_unlock(&guc->submission_state.lock);
> > +	} else {
> > +		/* Forcefully kill any remaining exec queues, signal fences */
> Q: Shall we do VF bypass here?
> 

Same answer as last patch - no.

Matt

> Regards,
> Zhanjun Dong
> > +		guc_submit_reset_prepare(guc);
> > +		xe_guc_submit_stop(guc);
> > +		xe_guc_softreset(guc);
> > +		xe_uc_fw_sanitize(&guc->fw);
> > +		xe_guc_submit_pause_abort(guc);
> > +	}
> >   }
> >   static bool guc_submit_hint_wedged(struct xe_guc *guc)
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v9 2/7] drm/xe: Forcefully tear down exec queues in GuC submit fini
  2026-03-13 18:48     ` Matthew Brost
@ 2026-03-13 20:30       ` Dong, Zhanjun
  0 siblings, 0 replies; 11+ messages in thread
From: Dong, Zhanjun @ 2026-03-13 20:30 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe, stable


On 2026-03-13 2:48 p.m., Matthew Brost wrote:
> On Wed, Mar 11, 2026 at 12:34:30PM -0400, Dong, Zhanjun wrote:
>>
>>
>> On 2026-03-10 6:50 p.m., Zhanjun Dong wrote:
>>> In GuC submit fini, forcefully tear down any exec queues by disabling
>>> CTs, stopping the scheduler (which cleans up lost G2H), killing all
>>> remaining queues, and resuming scheduling to allow any remaining cleanup
>>> actions to complete and signal any remaining fences.
>>>
>>> Split guc_submit_fini into a device-related part and a software-only part.
>>> Using device-managed and drm-managed actions guarantees the correct
>>> ordering of cleanup.
>>>
>>> Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
>>> Cc: stable@vger.kernel.org
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/xe/xe_guc.c        | 26 ++++++++++++++--
>>>    drivers/gpu/drm/xe/xe_guc.h        |  1 +
>>>    drivers/gpu/drm/xe/xe_guc_submit.c | 48 +++++++++++++++++++++++-------
>>>    3 files changed, 63 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>>> index e75653a5e797..f6964b8f8ede 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>>> @@ -1399,15 +1399,37 @@ int xe_guc_enable_communication(struct xe_guc *guc)
>>>    	return 0;
>>>    }
>>> -int xe_guc_suspend(struct xe_guc *guc)
>>> +/**
>>> + * xe_guc_softreset() - Soft reset GuC
>>> + * @guc: The GuC object
>>> + *
>>> + * Send soft reset command to GuC through mmio send.
>>> + *
>>> + * Return: 0 if success, otherwise error code
>>> + */
>>> +int xe_guc_softreset(struct xe_guc *guc)
>>>    {
>>> -	struct xe_gt *gt = guc_to_gt(guc);
>>>    	u32 action[] = {
>>>    		XE_GUC_ACTION_CLIENT_SOFT_RESET,
>>>    	};
>>>    	int ret;
>>> +	if (!xe_uc_fw_is_running(&guc->fw))
>>> +		return 0;
>>> +
>>>    	ret = xe_guc_mmio_send(guc, action, ARRAY_SIZE(action));
>>> +	if (ret)
>>> +		return ret;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +int xe_guc_suspend(struct xe_guc *guc)
>>> +{
>>> +	struct xe_gt *gt = guc_to_gt(guc);
>>> +	int ret;
>>> +
>>> +	ret = xe_guc_softreset(guc);
>>>    	if (ret) {
>>>    		xe_gt_err(gt, "GuC suspend failed: %pe\n", ERR_PTR(ret));
>>>    		return ret;
>>> diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
>>> index 66e7edc70ed9..02514914f404 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc.h
>>> +++ b/drivers/gpu/drm/xe/xe_guc.h
>>> @@ -44,6 +44,7 @@ int xe_guc_opt_in_features_enable(struct xe_guc *guc);
>>>    void xe_guc_runtime_suspend(struct xe_guc *guc);
>>>    void xe_guc_runtime_resume(struct xe_guc *guc);
>>>    int xe_guc_suspend(struct xe_guc *guc);
>>> +int xe_guc_softreset(struct xe_guc *guc);
>>>    void xe_guc_notify(struct xe_guc *guc);
>>>    int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr);
>>>    int xe_guc_mmio_send(struct xe_guc *guc, const u32 *request, u32 len);
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>>> index b31e0e0af5cb..8afd424b27fb 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>>> @@ -47,6 +47,8 @@
>>>    #define XE_GUC_EXEC_QUEUE_CGP_CONTEXT_ERROR_LEN		6
>>> +static int guc_submit_reset_prepare(struct xe_guc *guc);
>>> +
>>>    static struct xe_guc *
>>>    exec_queue_to_guc(struct xe_exec_queue *q)
>>>    {
>>> @@ -238,7 +240,7 @@ static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
>>>    		 EXEC_QUEUE_STATE_BANNED));
>>>    }
>>> -static void guc_submit_fini(struct drm_device *drm, void *arg)
>>> +static void guc_submit_sw_fini(struct drm_device *drm, void *arg)
>>>    {
>>>    	struct xe_guc *guc = arg;
>>>    	struct xe_device *xe = guc_to_xe(guc);
>>> @@ -256,6 +258,19 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
>>>    	xa_destroy(&guc->submission_state.exec_queue_lookup);
>>>    }
>>> +static void guc_submit_fini(void *arg)
>>> +{
>>> +	struct xe_guc *guc = arg;
>>> +
>>> +	/* Forcefully kill any remaining exec queues */
>> Shall we do VF bypass here?
>>
> 
> Why? These flows work on VFs and are still required. This is initiating
> a software cut off communication with the GuC and cleaning up any lost
> communications with the GuC so all queues are destroyed.
> 
> Matt

Got it. LGTM

Reviewed-by: Zhanjun Dong <zhanjun.dong@intel.com>

> 
>> Regards,
>> Zhanjun Dong
>>> +	xe_guc_ct_stop(&guc->ct);
>>> +	guc_submit_reset_prepare(guc);
>>> +	xe_guc_softreset(guc);
>>> +	xe_guc_submit_stop(guc);
>>> +	xe_uc_fw_sanitize(&guc->fw);
>>> +	xe_guc_submit_pause_abort(guc);
>>> +}
>>> +
>>>    static void guc_submit_wedged_fini(void *arg)
>>>    {
>>>    	struct xe_guc *guc = arg;
>>> @@ -325,7 +340,11 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids)
>>>    	guc->submission_state.initialized = true;
>>> -	return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc);
>>> +	err = drmm_add_action_or_reset(&xe->drm, guc_submit_sw_fini, guc);
>>> +	if (err)
>>> +		return err;
>>> +
>>> +	return devm_add_action_or_reset(xe->drm.dev, guc_submit_fini, guc);
>>>    }
>>>    /*
>>> @@ -2298,6 +2317,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
>>>    static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
>>>    {
>>>    	struct xe_gpu_scheduler *sched = &q->guc->sched;
>>> +	bool do_destroy = false;
>>>    	/* Stop scheduling + flush any DRM scheduler operations */
>>>    	xe_sched_submission_stop(sched);
>>> @@ -2305,7 +2325,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
>>>    	/* Clean up lost G2H + reset engine state */
>>>    	if (exec_queue_registered(q)) {
>>>    		if (exec_queue_destroyed(q))
>>> -			__guc_exec_queue_destroy(guc, q);
>>> +			do_destroy = true;
>>>    	}
>>>    	if (q->guc->suspend_pending) {
>>>    		set_exec_queue_suspended(q);
>>> @@ -2341,18 +2361,15 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
>>>    			xe_guc_exec_queue_trigger_cleanup(q);
>>>    		}
>>>    	}
>>> +
>>> +	if (do_destroy)
>>> +		__guc_exec_queue_destroy(guc, q);
>>>    }
>>> -int xe_guc_submit_reset_prepare(struct xe_guc *guc)
>>> +static int guc_submit_reset_prepare(struct xe_guc *guc)
>>>    {
>>>    	int ret;
>>> -	if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
>>> -		return 0;
>>> -
>>> -	if (!guc->submission_state.initialized)
>>> -		return 0;
>>> -
>>>    	/*
>>>    	 * Using an atomic here rather than submission_state.lock as this
>>>    	 * function can be called while holding the CT lock (engine reset
>>> @@ -2367,6 +2384,17 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
>>>    	return ret;
>>>    }
>>> +int xe_guc_submit_reset_prepare(struct xe_guc *guc)
>>> +{
>>> +	if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
>>> +		return 0;
>>> +
>>> +	if (!guc->submission_state.initialized)
>>> +		return 0;
>>> +
>>> +	return guc_submit_reset_prepare(guc);
>>> +}
>>> +
>>>    void xe_guc_submit_reset_wait(struct xe_guc *guc)
>>>    {
>>>    	wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||
>>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v9 3/7] drm/xe: Trigger queue cleanup if not in wedged mode 2
  2026-03-13 18:49     ` Matthew Brost
@ 2026-03-13 20:31       ` Dong, Zhanjun
  0 siblings, 0 replies; 11+ messages in thread
From: Dong, Zhanjun @ 2026-03-13 20:31 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe, stable

LGTM

Reviewed-by: Zhanjun Dong <zhanjun.dong@intel.com>


On 2026-03-13 2:49 p.m., Matthew Brost wrote:
> On Wed, Mar 11, 2026 at 12:33:54PM -0400, Dong, Zhanjun wrote:
>>
>>
>> On 2026-03-10 6:50 p.m., Zhanjun Dong wrote:
>>> The intent of wedging a device is to allow queues to continue running
>>> only in wedged mode 2. In other modes, queues should initiate cleanup
>>> and signal all remaining fences. Fix xe_guc_submit_wedge to correctly
>>> clean up queues when wedge mode != 2.
>>>
>>> Fixes: 7dbe8af13c18 ("drm/xe: Wedge the entire device")
>>> Cc: stable@vger.kernel.org
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/xe/xe_guc_submit.c | 35 +++++++++++++++++++-----------
>>>    1 file changed, 22 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>>> index 8afd424b27fb..cb32053d57ec 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>>> @@ -1319,6 +1319,7 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
>>>     */
>>>    void xe_guc_submit_wedge(struct xe_guc *guc)
>>>    {
>>> +	struct xe_device *xe = guc_to_xe(guc);
>>>    	struct xe_gt *gt = guc_to_gt(guc);
>>>    	struct xe_exec_queue *q;
>>>    	unsigned long index;
>>> @@ -1333,20 +1334,28 @@ void xe_guc_submit_wedge(struct xe_guc *guc)
>>>    	if (!guc->submission_state.initialized)
>>>    		return;
>>> -	err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
>>> -				       guc_submit_wedged_fini, guc);
>>> -	if (err) {
>>> -		xe_gt_err(gt, "Failed to register clean-up in wedged.mode=%s; "
>>> -			  "Although device is wedged.\n",
>>> -			  xe_wedged_mode_to_string(XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET));
>>> -		return;
>>> -	}
>>> +	if (xe->wedged.mode == 2) {
>>> +		err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
>>> +					       guc_submit_wedged_fini, guc);
>>> +		if (err) {
>>> +			xe_gt_err(gt, "Failed to register clean-up on wedged.mode=2; "
>>> +				  "Although device is wedged.\n");
>>> +			return;
>>> +		}
>>> -	mutex_lock(&guc->submission_state.lock);
>>> -	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
>>> -		if (xe_exec_queue_get_unless_zero(q))
>>> -			set_exec_queue_wedged(q);
>>> -	mutex_unlock(&guc->submission_state.lock);
>>> +		mutex_lock(&guc->submission_state.lock);
>>> +		xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
>>> +			if (xe_exec_queue_get_unless_zero(q))
>>> +				set_exec_queue_wedged(q);
>>> +		mutex_unlock(&guc->submission_state.lock);
>>> +	} else {
>>> +		/* Forcefully kill any remaining exec queues, signal fences */
>> Q: Shall we do VF bypass here?
>>
> 
> Same answer as last patch - no.
> 
> Matt
> 
>> Regards,
>> Zhanjun Dong
>>> +		guc_submit_reset_prepare(guc);
>>> +		xe_guc_submit_stop(guc);
>>> +		xe_guc_softreset(guc);
>>> +		xe_uc_fw_sanitize(&guc->fw);
>>> +		xe_guc_submit_pause_abort(guc);
>>> +	}
>>>    }
>>>    static bool guc_submit_hint_wedged(struct xe_guc *guc)
>>


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2026-03-13 20:31 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed
-- links below jump to the message on this page --)
     [not found] <20260310225039.1320161-1-zhanjun.dong@intel.com>
2026-03-10 22:50 ` [PATCH v9 1/7] drm/xe: Always kill exec queues in xe_guc_submit_pause_abort Zhanjun Dong
2026-03-10 22:50 ` [PATCH v9 2/7] drm/xe: Forcefully tear down exec queues in GuC submit fini Zhanjun Dong
2026-03-11 16:34   ` Dong, Zhanjun
2026-03-13 18:48     ` Matthew Brost
2026-03-13 20:30       ` Dong, Zhanjun
2026-03-10 22:50 ` [PATCH v9 3/7] drm/xe: Trigger queue cleanup if not in wedged mode 2 Zhanjun Dong
2026-03-11 16:33   ` Dong, Zhanjun
2026-03-13 18:49     ` Matthew Brost
2026-03-13 20:31       ` Dong, Zhanjun
2026-03-10 22:50 ` [PATCH v9 5/7] drm/xe/guc: Ensure CT state transitions via STOP before DISABLED Zhanjun Dong
2026-03-10 22:50 ` [PATCH v9 7/7] drm/xe: Open-code GGTT MMIO access protection Zhanjun Dong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox