From: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
To: intel-xe@lists.freedesktop.org
Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>,
"Matthew Auld" <matthew.auld@intel.com>,
"Satyanarayana K V P" <satyanarayana.k.v.p@intel.com>,
stable@vger.kernel.org
Subject: [PATCH 1/5] drm/xe/guc: Defer user exec queue scheduler start until after page table restore
Date: Fri, 22 May 2026 18:43:51 +0200 [thread overview]
Message-ID: <20260522164355.2773-2-thomas.hellstrom@linux.intel.com> (raw)
In-Reply-To: <20260522164355.2773-1-thomas.hellstrom@linux.intel.com>
On S3/S4 and d3cold runtime PM resume, exec queue schedulers are
restarted before xe_bo_restore_late() has restored userspace VM page
table BOs and LRC BOs. If a pending job is submitted in this window,
GuC will attempt to load the context using stale or invalid data in
VRAM, leading to GuC exceptions.
Defer user exec queue scheduler start until after page tables and LRC
BOs are restored, ensuring no job can be submitted before the backing
storage is valid. Migrate and kernel VM exec queues are still started
immediately as they are required by the restore process itself.
For GT reset, VRAM is not evicted and all BOs remain valid, so user
exec queue schedulers are started without deferral.
This covers both LR and non-LR userspace exec queues.
Fixes: 7f387e6012b6 ("drm/xe: add XE_BO_FLAG_PINNED_LATE_RESTORE")
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: <stable@vger.kernel.org> # v6.16+
Assisted-by: GitHub_Copilot:claude-sonnet-4.6
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
drivers/gpu/drm/xe/xe_gt.c | 16 +++++++++++
drivers/gpu/drm/xe/xe_gt.h | 2 ++
drivers/gpu/drm/xe/xe_guc.c | 13 +++++++++
drivers/gpu/drm/xe/xe_guc.h | 1 +
drivers/gpu/drm/xe/xe_guc_submit.c | 44 ++++++++++++++++++++++++++++++
drivers/gpu/drm/xe/xe_guc_submit.h | 1 +
drivers/gpu/drm/xe/xe_pm.c | 6 ++++
drivers/gpu/drm/xe/xe_uc.c | 16 +++++++++++
drivers/gpu/drm/xe/xe_uc.h | 1 +
9 files changed, 100 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 783eb6d631b5..2c63e4d6a649 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -955,6 +955,8 @@ static void gt_reset_worker(struct work_struct *w)
if (err)
goto err_out;
+ xe_uc_start_user_queues(&gt->uc);
+
xe_force_wake_put(gt_to_fw(gt), fw_ref);
/* Pair with get while enqueueing the work in xe_gt_reset_async() */
@@ -967,6 +969,7 @@ static void gt_reset_worker(struct work_struct *w)
err_out:
xe_force_wake_put(gt_to_fw(gt), fw_ref);
XE_WARN_ON(xe_uc_start(&gt->uc));
+ xe_uc_start_user_queues(&gt->uc);
err_fail:
xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
@@ -1050,6 +1053,19 @@ int xe_gt_sanitize_freq(struct xe_gt *gt)
return ret;
}
+/**
+ * xe_gt_start_user_queues() - Start user exec queues after page table restore
+ * @gt: the GT object
+ *
+ * Starts the DRM schedulers for all user exec queues on the GT. This must be
+ * called after xe_bo_restore_late() to ensure that userspace page table BOs
+ * are valid before any job submission triggers GuC context registration.
+ */
+void xe_gt_start_user_queues(struct xe_gt *gt)
+{
+ xe_uc_start_user_queues(&gt->uc);
+}
+
int xe_gt_resume(struct xe_gt *gt)
{
int err;
diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
index 4150aa594f05..b6ba05a317f7 100644
--- a/drivers/gpu/drm/xe/xe_gt.h
+++ b/drivers/gpu/drm/xe/xe_gt.h
@@ -170,4 +170,6 @@ static inline bool xe_gt_supports_multi_queue(const struct xe_gt *gt,
return gt->info.multi_queue_engine_class_mask & BIT(class);
}
+void xe_gt_start_user_queues(struct xe_gt *gt);
+
#endif
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index 4023700ff2a9..0359909b8b27 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -1717,6 +1717,19 @@ int xe_guc_start(struct xe_guc *guc)
return xe_guc_submit_start(guc);
}
+/**
+ * xe_guc_start_user_queues() - Start user exec queue schedulers on the GuC
+ * @guc: the GuC object
+ *
+ * Starts the DRM schedulers for all user exec queues managed by this GuC.
+ * Must be called after xe_bo_restore_late() to ensure page tables are valid
+ * before any job submission triggers GuC context registration.
+ */
+void xe_guc_start_user_queues(struct xe_guc *guc)
+{
+ xe_guc_submit_start_user_queues(guc);
+}
+
/**
* xe_guc_runtime_suspend() - GuC runtime suspend
* @guc: The GuC object
diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
index 02514914f404..ad2a6521852c 100644
--- a/drivers/gpu/drm/xe/xe_guc.h
+++ b/drivers/gpu/drm/xe/xe_guc.h
@@ -60,6 +60,7 @@ void xe_guc_reset_wait(struct xe_guc *guc);
void xe_guc_stop_prepare(struct xe_guc *guc);
void xe_guc_stop(struct xe_guc *guc);
int xe_guc_start(struct xe_guc *guc);
+void xe_guc_start_user_queues(struct xe_guc *guc);
void xe_guc_declare_wedged(struct xe_guc *guc);
bool xe_guc_using_main_gamctrl_queues(struct xe_guc *guc);
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 4d32b430bc15..084ecc8e7efa 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -2535,6 +2535,16 @@ static void guc_exec_queue_start(struct xe_exec_queue *q)
xe_sched_submission_resume_tdr(sched);
}
+/*
+ * Returns true for user exec queues whose page tables may not yet be
+ * restored when xe_guc_submit_start() is called during GT resume.
+ * These queues must be started later, after xe_bo_restore_late().
+ */
+static bool exec_queue_needs_late_start(const struct xe_exec_queue *q)
+{
+ return !(q->flags & (EXEC_QUEUE_FLAG_MIGRATE | EXEC_QUEUE_FLAG_VM));
+}
+
int xe_guc_submit_start(struct xe_guc *guc)
{
struct xe_exec_queue *q;
@@ -2549,6 +2559,10 @@ int xe_guc_submit_start(struct xe_guc *guc)
if (q->guc->id != index)
continue;
+ /* User queues are deferred until page tables are restored */
+ if (exec_queue_needs_late_start(q))
+ continue;
+
guc_exec_queue_start(q);
}
mutex_unlock(&guc->submission_state.lock);
@@ -2558,6 +2572,36 @@ int xe_guc_submit_start(struct xe_guc *guc)
return 0;
}
+/**
+ * xe_guc_submit_start_user_queues() - Start user exec queues after late restore
+ * @guc: the GuC object
+ *
+ * Starts the DRM schedulers for all user exec queues (those not flagged as
+ * migrate or VM queues). Must be called after xe_bo_restore_late() to ensure
+ * page tables are valid before any job submission is attempted.
+ */
+void xe_guc_submit_start_user_queues(struct xe_guc *guc)
+{
+ struct xe_exec_queue *q;
+ unsigned long index;
+
+ if (!guc->submission_state.initialized)
+ return;
+
+ mutex_lock(&guc->submission_state.lock);
+ xa_for_each(&guc->submission_state.exec_queue_lookup, index, q) {
+ /* Prevent redundant attempts to start parallel queues */
+ if (q->guc->id != index)
+ continue;
+
+ if (!exec_queue_needs_late_start(q))
+ continue;
+
+ guc_exec_queue_start(q);
+ }
+ mutex_unlock(&guc->submission_state.lock);
+}
+
static void guc_exec_queue_unpause_prepare(struct xe_guc *guc,
struct xe_exec_queue *q)
{
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
index b3839a90c142..b210b2f6cd2d 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -20,6 +20,7 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc);
void xe_guc_submit_reset_wait(struct xe_guc *guc);
void xe_guc_submit_stop(struct xe_guc *guc);
int xe_guc_submit_start(struct xe_guc *guc);
+void xe_guc_submit_start_user_queues(struct xe_guc *guc);
void xe_guc_submit_pause(struct xe_guc *guc);
void xe_guc_submit_pause_abort(struct xe_guc *guc);
void xe_guc_submit_pause_vf(struct xe_guc *guc);
diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
index d4672eb07476..c203a59d7000 100644
--- a/drivers/gpu/drm/xe/xe_pm.c
+++ b/drivers/gpu/drm/xe/xe_pm.c
@@ -282,6 +282,9 @@ int xe_pm_resume(struct xe_device *xe)
if (err)
goto err;
+ for_each_gt(gt, xe, id)
+ xe_gt_start_user_queues(gt);
+
xe_pxp_pm_resume(xe->pxp);
if (IS_VF_CCS_READY(xe))
@@ -696,6 +699,9 @@ int xe_pm_runtime_resume(struct xe_device *xe)
err = xe_bo_restore_late(xe);
if (err)
goto out;
+
+ for_each_gt(gt, xe, id)
+ xe_gt_start_user_queues(gt);
}
xe_pxp_pm_resume(xe->pxp);
diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
index 75091bde0d50..12606133f5bc 100644
--- a/drivers/gpu/drm/xe/xe_uc.c
+++ b/drivers/gpu/drm/xe/xe_uc.c
@@ -263,6 +263,22 @@ int xe_uc_start(struct xe_uc *uc)
return xe_guc_start(&uc->guc);
}
+/**
+ * xe_uc_start_user_queues() - Start user exec queues after late restore
+ * @uc: the UC object
+ *
+ * Starts the DRM schedulers for all user exec queues. Must be called after
+ * xe_bo_restore_late() to ensure page tables are valid before any job
+ * submission is attempted. Has no effect if GuC submission is not enabled.
+ */
+void xe_uc_start_user_queues(struct xe_uc *uc)
+{
+ if (!xe_device_uc_enabled(uc_to_xe(uc)))
+ return;
+
+ xe_guc_start_user_queues(&uc->guc);
+}
+
static void uc_reset_wait(struct xe_uc *uc)
{
int ret;
diff --git a/drivers/gpu/drm/xe/xe_uc.h b/drivers/gpu/drm/xe/xe_uc.h
index 255a54a8f876..2fd056cfa1d0 100644
--- a/drivers/gpu/drm/xe/xe_uc.h
+++ b/drivers/gpu/drm/xe/xe_uc.h
@@ -18,6 +18,7 @@ void xe_uc_runtime_suspend(struct xe_uc *uc);
void xe_uc_stop_prepare(struct xe_uc *uc);
void xe_uc_stop(struct xe_uc *uc);
int xe_uc_start(struct xe_uc *uc);
+void xe_uc_start_user_queues(struct xe_uc *uc);
void xe_uc_suspend_prepare(struct xe_uc *uc);
int xe_uc_suspend(struct xe_uc *uc);
int xe_uc_sanitize_reset(struct xe_uc *uc);
--
2.54.0
next parent reply other threads:[~2026-05-22 16:44 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <20260522164355.2773-1-thomas.hellstrom@linux.intel.com>
2026-05-22 16:43 ` Thomas Hellström [this message]
2026-05-22 16:43 ` [PATCH 2/5] drm/xe/guc: Don't ban LR VM exec queues on PM suspend Thomas Hellström
2026-05-22 16:43 ` [PATCH 5/5] drm/xe: Suspend fault-mode LR jobs before VRAM eviction on S3/S4 Thomas Hellström
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260522164355.2773-2-thomas.hellstrom@linux.intel.com \
--to=thomas.hellstrom@linux.intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=matthew.auld@intel.com \
--cc=satyanarayana.k.v.p@intel.com \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox