From: Matthew Brost <matthew.brost@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Subject: [RFC PATCH 10/12] drm/xe: Use DRM dep queue kill semantics
Date: Sun, 15 Mar 2026 21:32:53 -0700 [thread overview]
Message-ID: <20260316043255.226352-11-matthew.brost@intel.com> (raw)
In-Reply-To: <20260316043255.226352-1-matthew.brost@intel.com>
Once the GuC context has its scheduling disabled by TDR or kill work
item, the queue is taken off the hardware and can no longer touch memory
without risking corruption. Invoke drm_dep_queue_kill, which bypasses
any remaining job dependencies in the queue and calls run_job
immediately for each remaining job. In run_job, if
drm_dep_queue_is_killed reports that the queue has been killed,
immediately signal the hardware fence, as the queue can no longer
access any memory associated with the fence being signaled.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_guc_exec_queue_types.h | 2 +
drivers/gpu/drm/xe/xe_guc_submit.c | 139 +++++++++++++------
2 files changed, 97 insertions(+), 44 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
index cb15e86823d2..72de6d0a754a 100644
--- a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
@@ -29,6 +29,8 @@ struct xe_guc_exec_queue {
*/
#define MAX_STATIC_MSG_TYPE 3
struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE];
+ /** @kill_work: Kill work item */
+ struct delayed_work kill_work;
/** @resume_time: time of last resume */
u64 resume_time;
/** @state: GuC specific state for this xe_exec_queue */
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 064cf15166b9..58569969b4c7 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1216,6 +1216,8 @@ guc_exec_queue_run_job(struct drm_dep_job *drm_job)
trace_xe_sched_job_run(job);
if (!killed_or_banned_or_wedged && !xe_sched_job_is_error(job)) {
+ xe_gt_assert(guc_to_gt(guc), !drm_dep_queue_is_killed(&q->dep_q));
+
if (xe_exec_queue_is_multi_queue_secondary(q)) {
struct xe_exec_queue *primary = xe_exec_queue_multi_queue_primary(q);
@@ -1234,6 +1236,15 @@ guc_exec_queue_run_job(struct drm_dep_job *drm_job)
q->ring_ops->emit_job(job);
submit_exec_queue(q, job);
job->restore_replay = false;
+ } else if (drm_dep_queue_is_killed(&q->dep_q)) {
+ xe_sched_job_set_error(job, -ECANCELED); /* fence signal */
+ dma_fence_put(job->fence); /* drop the DRM dep reference */
+
+ /*
+ * Our queue is off hardware, so fences can be signaled
+ * immediately with an error.
+ */
+ return ERR_PTR(-ECANCELED);
}
run_job_out:
@@ -1479,29 +1490,21 @@ static void disable_scheduling(struct xe_exec_queue *q, bool immediate)
}
static enum drm_dep_timedout_stat
-guc_exec_queue_timedout_job(struct drm_dep_job *drm_job)
+__guc_exec_queue_timedout_job(struct xe_guc *guc, struct xe_exec_queue *q,
+ struct xe_sched_job *job)
{
- struct xe_sched_job *job = to_xe_sched_job(drm_job);
struct drm_dep_job *tmp_job;
- struct xe_exec_queue *q = job->q, *primary;
+ struct xe_exec_queue *primary = xe_exec_queue_multi_queue_primary(q);
struct xe_gpu_scheduler *sched = &q->guc->sched;
- struct xe_guc *guc = exec_queue_to_guc(q);
const char *process_name = "no process";
struct xe_device *xe = guc_to_xe(guc);
int err = -ETIME;
pid_t pid = -1;
bool wedged = false, skip_timeout_check;
- xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
-
- if (drm_dep_job_is_finished(&job->drm))
- return DRM_DEP_TIMEDOUT_STAT_JOB_SIGNALED;
-
if (vf_recovery(guc))
return DRM_DEP_TIMEDOUT_STAT_REQUEUE_JOB;
- primary = xe_exec_queue_multi_queue_primary(q);
-
/* Kill the run_job entry point */
if (xe_exec_queue_is_multi_queue(q))
xe_guc_exec_queue_group_stop(q);
@@ -1509,7 +1512,7 @@ guc_exec_queue_timedout_job(struct drm_dep_job *drm_job)
xe_sched_submission_stop(sched);
/* Must check all state after stopping scheduler */
- skip_timeout_check = exec_queue_reset(q) ||
+ skip_timeout_check = !job || exec_queue_reset(q) ||
exec_queue_killed_or_banned_or_wedged(q);
/* Skip timeout check if multi-queue group is banned */
@@ -1603,43 +1606,45 @@ guc_exec_queue_timedout_job(struct drm_dep_job *drm_job)
}
}
- if (q->vm && q->vm->xef) {
- process_name = q->vm->xef->process_name;
- pid = q->vm->xef->pid;
- }
+ if (job) {
+ if (q->vm && q->vm->xef) {
+ process_name = q->vm->xef->process_name;
+ pid = q->vm->xef->pid;
+ }
- if (!exec_queue_killed(q))
- xe_gt_notice(guc_to_gt(guc),
- "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
- xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
- q->guc->id, q->flags, process_name, pid);
+ if (!exec_queue_killed(q))
+ xe_gt_notice(guc_to_gt(guc),
+ "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
+ xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
+ q->guc->id, q->flags, process_name, pid);
- trace_xe_sched_job_timedout(job);
+ trace_xe_sched_job_timedout(job);
- if (!exec_queue_killed(q))
- xe_devcoredump(q, job,
- "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
- xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
- q->guc->id, q->flags);
+ if (!exec_queue_killed(q))
+ xe_devcoredump(q, job,
+ "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
+ xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
+ q->guc->id, q->flags);
- /*
- * Kernel jobs should never fail, nor should VM jobs if they do
- * somethings has gone wrong and the GT needs a reset
- */
- xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
- "Kernel-submitted job timed out\n");
- xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
- "VM job timed out on non-killed execqueue\n");
- if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
- (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
- if (!xe_sched_invalidate_job(job, 2))
- xe_gt_reset_async(q->gt);
- }
+ /*
+ * Kernel jobs should never fail, nor should VM jobs; if they do,
+ * something has gone wrong and the GT needs a reset
+ */
+ xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
+ "Kernel-submitted job timed out\n");
+ xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
+ "VM job timed out on non-killed execqueue\n");
+ if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
+ (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
+ if (!xe_sched_invalidate_job(job, 2))
+ xe_gt_reset_async(q->gt);
+ }
- /* Mark all outstanding jobs as bad, thus completing them */
- xe_sched_job_set_error(job, err);
- drm_dep_queue_for_each_pending_job(tmp_job, sched->dep_q)
- xe_sched_job_set_error(to_xe_sched_job(tmp_job), -ECANCELED);
+ /* Mark all outstanding jobs as bad, thus completing them */
+ xe_sched_job_set_error(job, err);
+ drm_dep_queue_for_each_pending_job(tmp_job, sched->dep_q)
+ xe_sched_job_set_error(to_xe_sched_job(tmp_job), -ECANCELED);
+ }
if (xe_exec_queue_is_multi_queue(q)) {
xe_guc_exec_queue_group_start(q);
@@ -1649,6 +1654,10 @@ guc_exec_queue_timedout_job(struct drm_dep_job *drm_job)
xe_guc_exec_queue_trigger_cleanup(q);
}
+ /* Queue is off hardware; start flushing jobs bypassing dependencies. */
+ drm_dep_queue_kill(&q->dep_q);
+ cancel_delayed_work(&q->guc->kill_work);
+
return DRM_DEP_TIMEDOUT_STAT_REQUEUE_JOB;
rearm:
@@ -1665,6 +1674,43 @@ guc_exec_queue_timedout_job(struct drm_dep_job *drm_job)
return DRM_DEP_TIMEDOUT_STAT_REQUEUE_JOB;
}
+static enum drm_dep_timedout_stat
+guc_exec_queue_timedout_job(struct drm_dep_job *drm_job)
+{
+ struct xe_sched_job *job = to_xe_sched_job(drm_job);
+ struct xe_exec_queue *q = job->q;
+ struct xe_guc *guc = exec_queue_to_guc(q);
+
+ xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
+
+ if (drm_dep_job_is_finished(&job->drm))
+ return DRM_DEP_TIMEDOUT_STAT_JOB_SIGNALED;
+
+ return __guc_exec_queue_timedout_job(guc, q, job);
+}
+
+static void guc_exec_queue_kill_work_func(struct work_struct *w)
+{
+ struct xe_guc_exec_queue *ge =
+ container_of(w, typeof(*ge), kill_work.work);
+ struct xe_exec_queue *q = ge->q;
+ struct xe_exec_queue *primary = xe_exec_queue_multi_queue_primary(q);
+ struct xe_guc *guc = exec_queue_to_guc(q);
+
+ xe_gt_assert(guc_to_gt(guc), exec_queue_killed(q));
+
+ if (drm_dep_queue_is_killed(&q->dep_q))
+ return;
+
+ if (!(exec_queue_enabled(primary) ||
+ exec_queue_pending_disable(primary))) {
+ drm_dep_queue_kill(&q->dep_q);
+ return;
+ }
+
+ __guc_exec_queue_timedout_job(guc, q, NULL);
+}
+
static void guc_exec_queue_fini(struct xe_exec_queue *q)
{
struct xe_guc_exec_queue *ge = q->guc;
@@ -1915,6 +1961,7 @@ static void guc_dep_queue_fini(struct drm_dep_queue *dep_q)
{
struct xe_exec_queue *q = container_of(dep_q, typeof(*q), dep_q);
+ cancel_delayed_work(&q->guc->kill_work);
xe_exec_queue_destroy(q);
}
@@ -1949,6 +1996,7 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
q->guc = ge;
ge->q = q;
init_waitqueue_head(&ge->suspend_wait);
+ INIT_DELAYED_WORK(&ge->kill_work, guc_exec_queue_kill_work_func);
for (i = 0; i < MAX_STATIC_MSG_TYPE; ++i)
INIT_LIST_HEAD(&ge->static_msgs[i].link);
@@ -2028,6 +2076,9 @@ static void guc_exec_queue_kill(struct xe_exec_queue *q)
set_exec_queue_killed(q);
__suspend_fence_signal(q);
xe_guc_exec_queue_trigger_cleanup(q);
+
+ mod_delayed_work(drm_dep_queue_timeout_wq(&q->dep_q),
+ &q->guc->kill_work, 2);
}
static void guc_exec_queue_add_msg(struct xe_exec_queue *q, struct xe_sched_msg *msg,
--
2.34.1
next prev parent reply other threads:[~2026-03-16 4:33 UTC|newest]
Thread overview: 65+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-16 4:32 [RFC PATCH 00/12] Introduce DRM dep queue Matthew Brost
2026-03-16 4:32 ` [RFC PATCH 01/12] workqueue: Add interface to teach lockdep to warn on reclaim violations Matthew Brost
2026-03-25 15:59 ` Tejun Heo
2026-03-26 1:49 ` Matthew Brost
2026-03-26 2:19 ` Tejun Heo
2026-03-27 4:33 ` Matthew Brost
2026-03-27 17:25 ` Tejun Heo
2026-03-16 4:32 ` [RFC PATCH 02/12] drm/dep: Add DRM dependency queue layer Matthew Brost
2026-03-16 9:16 ` Boris Brezillon
2026-03-17 5:22 ` Matthew Brost
2026-03-17 8:48 ` Boris Brezillon
2026-03-16 10:25 ` Danilo Krummrich
2026-03-17 5:10 ` Matthew Brost
2026-03-17 12:19 ` Danilo Krummrich
2026-03-18 23:02 ` Matthew Brost
2026-03-17 2:47 ` Daniel Almeida
2026-03-17 5:45 ` Matthew Brost
2026-03-17 7:17 ` Miguel Ojeda
2026-03-17 8:26 ` Matthew Brost
2026-03-17 12:04 ` Daniel Almeida
2026-03-17 19:41 ` Miguel Ojeda
2026-03-23 17:31 ` Matthew Brost
2026-03-23 17:42 ` Miguel Ojeda
2026-03-17 18:14 ` Matthew Brost
2026-03-17 19:48 ` Daniel Almeida
2026-03-17 20:43 ` Boris Brezillon
2026-03-18 22:40 ` Matthew Brost
2026-03-19 9:57 ` Boris Brezillon
2026-03-22 6:43 ` Matthew Brost
2026-03-23 7:58 ` Matthew Brost
2026-03-23 10:06 ` Boris Brezillon
2026-03-23 17:11 ` Matthew Brost
2026-03-17 12:31 ` Danilo Krummrich
2026-03-17 14:25 ` Daniel Almeida
2026-03-17 14:33 ` Danilo Krummrich
2026-03-18 22:50 ` Matthew Brost
2026-03-17 8:47 ` Christian König
2026-03-17 14:55 ` Boris Brezillon
2026-03-18 23:28 ` Matthew Brost
2026-03-19 9:11 ` Boris Brezillon
2026-03-23 4:50 ` Matthew Brost
2026-03-23 9:55 ` Boris Brezillon
2026-03-23 17:08 ` Matthew Brost
2026-03-23 18:38 ` Matthew Brost
2026-03-24 9:23 ` Boris Brezillon
2026-03-24 16:06 ` Matthew Brost
2026-03-25 2:33 ` Matthew Brost
2026-03-24 8:49 ` Boris Brezillon
2026-03-24 16:51 ` Matthew Brost
2026-03-17 16:30 ` Shashank Sharma
2026-03-16 4:32 ` [RFC PATCH 03/12] drm/xe: Use WQ_MEM_WARN_ON_RECLAIM on all workqueues in the reclaim path Matthew Brost
2026-03-16 4:32 ` [RFC PATCH 04/12] drm/xe: Issue GGTT invalidation under lock in ggtt_node_remove Matthew Brost
2026-03-26 5:45 ` Bhadane, Dnyaneshwar
2026-03-16 4:32 ` [RFC PATCH 05/12] drm/xe: Return fence from xe_sched_job_arm and adjust job references Matthew Brost
2026-03-16 4:32 ` [RFC PATCH 06/12] drm/xe: Convert to DRM dep queue scheduler layer Matthew Brost
2026-03-16 4:32 ` [RFC PATCH 07/12] drm/xe: Make scheduler message lock IRQ-safe Matthew Brost
2026-03-16 4:32 ` [RFC PATCH 08/12] drm/xe: Rework exec queue object on top of DRM dep Matthew Brost
2026-03-16 4:32 ` [RFC PATCH 09/12] drm/xe: Enable IRQ job put in " Matthew Brost
2026-03-16 4:32 ` Matthew Brost [this message]
2026-03-16 4:32 ` [RFC PATCH 11/12] accel/amdxdna: Convert to drm_dep scheduler layer Matthew Brost
2026-03-16 4:32 ` [RFC PATCH 12/12] drm/panthor: " Matthew Brost
2026-03-16 4:52 ` ✗ CI.checkpatch: warning for Introduce DRM dep queue Patchwork
2026-03-16 4:53 ` ✓ CI.KUnit: success " Patchwork
2026-03-16 5:28 ` ✓ Xe.CI.BAT: " Patchwork
2026-03-16 8:09 ` ✗ Xe.CI.FULL: failure " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260316043255.226352-11-matthew.brost@intel.com \
--to=matthew.brost@intel.com \
--cc=dri-devel@lists.freedesktop.org \
--cc=intel-xe@lists.freedesktop.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox