From: Tomasz Lis <tomasz.lis@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: "Michał Winiarski" <michal.winiarski@intel.com>,
"Michał Wajdeczko" <michal.wajdeczko@intel.com>,
"Piotr Piórkowski" <piotr.piorkowski@intel.com>,
"Matthew Brost" <matthew.brost@intel.com>,
"Lucas De Marchi" <lucas.demarchi@intel.com>
Subject: [PATCH v3 3/7] drm/xe/vf: Pause submissions during RESFIX fixups
Date: Tue, 20 May 2025 01:19:21 +0200 [thread overview]
Message-ID: <20250519231925.3196154-4-tomasz.lis@intel.com> (raw)
In-Reply-To: <20250519231925.3196154-1-tomasz.lis@intel.com>
While post-migration fixups are being applied to a VF, GuC will not respond
to any commands. This means submissions have no way of finishing.
To avoid acquiring additional resources and then stalling
on hardware access, pause the submission work. This will
decrease the chance of depleting resources, and speed up
the recovery.
v2: Added a comment to the xe_irq_resume() call
v3: Typo fix
Signed-off-by: Tomasz Lis <tomasz.lis@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
---
drivers/gpu/drm/xe/xe_gpu_scheduler.c | 13 +++++++++
drivers/gpu/drm/xe/xe_gpu_scheduler.h | 1 +
drivers/gpu/drm/xe/xe_guc_submit.c | 35 ++++++++++++++++++++++
drivers/gpu/drm/xe/xe_guc_submit.h | 2 ++
drivers/gpu/drm/xe/xe_sriov_vf.c | 42 +++++++++++++++++++++++++++
5 files changed, 93 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.c b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
index 869b43a4151d..455ccaf17314 100644
--- a/drivers/gpu/drm/xe/xe_gpu_scheduler.c
+++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
@@ -101,6 +101,19 @@ void xe_sched_submission_stop(struct xe_gpu_scheduler *sched)
cancel_work_sync(&sched->work_process_msg);
}
+/**
+ * xe_sched_submission_stop_async - Stop further runs of submission tasks on a scheduler.
+ * @sched: the &xe_gpu_scheduler struct instance
+ *
+ * This call disables further runs of the scheduling work queue by calling
+ * drm_sched_wqueue_stop() on the embedded DRM scheduler. It does not wait
+ * for any in-progress runs to finish, only makes sure no further runs happen
+ * afterwards.
+ *
+ * Unlike xe_sched_submission_stop(), this function does not call
+ * cancel_work_sync() on the message-processing work item, so it never blocks
+ * on that work.
+ */
+void xe_sched_submission_stop_async(struct xe_gpu_scheduler *sched)
+{
+ drm_sched_wqueue_stop(&sched->base);
+}
+
void xe_sched_submission_resume_tdr(struct xe_gpu_scheduler *sched)
{
drm_sched_resume_timeout(&sched->base, sched->base.timeout);
diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
index c250ea773491..d78b4e8203f9 100644
--- a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
+++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
@@ -21,6 +21,7 @@ void xe_sched_fini(struct xe_gpu_scheduler *sched);
void xe_sched_submission_start(struct xe_gpu_scheduler *sched);
void xe_sched_submission_stop(struct xe_gpu_scheduler *sched);
+void xe_sched_submission_stop_async(struct xe_gpu_scheduler *sched);
void xe_sched_submission_resume_tdr(struct xe_gpu_scheduler *sched);
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 80f748baad3f..6f280333de13 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1811,6 +1811,19 @@ void xe_guc_submit_stop(struct xe_guc *guc)
}
+/**
+ * xe_guc_submit_pause - Stop further runs of submission tasks on given GuC.
+ * @guc: the &xe_guc struct instance whose exec queue schedulers are to be
+ *       paused
+ *
+ * Walks every exec queue found in the submission_state.exec_queue_lookup
+ * xarray of @guc and calls xe_sched_submission_stop_async() on that queue's
+ * scheduler. In-progress runs are not waited for. The pause is reverted by
+ * xe_guc_submit_unpause().
+ *
+ * NOTE(review): no lock is taken around the xarray walk in this function;
+ * confirm that this is safe against concurrent exec queue registration and
+ * removal.
+ */
+void xe_guc_submit_pause(struct xe_guc *guc)
+{
+ struct xe_exec_queue *q;
+ unsigned long index;
+
+ xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+ xe_sched_submission_stop_async(&q->guc->sched);
+}
+
static void guc_exec_queue_start(struct xe_exec_queue *q)
{
struct xe_gpu_scheduler *sched = &q->guc->sched;
@@ -1851,6 +1864,28 @@ int xe_guc_submit_start(struct xe_guc *guc)
return 0;
}
+static void guc_exec_queue_unpause(struct xe_exec_queue *q)
+{
+ struct xe_gpu_scheduler *sched = &q->guc->sched; /* scheduler embedded in the queue's GuC state */
+
+ xe_sched_submission_start(sched); /* re-enable submission on this queue's scheduler */
+}
+
+/**
+ * xe_guc_submit_unpause - Allow further runs of submission tasks on given GuC.
+ * @guc: the &xe_guc struct instance whose exec queue schedulers are to be
+ *       re-enabled
+ *
+ * Counterpart of xe_guc_submit_pause(). Walks every exec queue found in the
+ * submission_state.exec_queue_lookup xarray of @guc and restarts submission
+ * on its scheduler via guc_exec_queue_unpause(). Once all queues have been
+ * restarted, every waiter on the GuC CT wait queue (guc->ct.wq) is woken up,
+ * presumably so that tasks blocked during the pause re-check their wait
+ * conditions (TODO confirm which waiters depend on this).
+ */
+void xe_guc_submit_unpause(struct xe_guc *guc)
+{
+ struct xe_exec_queue *q;
+ unsigned long index;
+
+ xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+ guc_exec_queue_unpause(q);
+
+ wake_up_all(&guc->ct.wq);
+}
+
static struct xe_exec_queue *
g2h_exec_queue_lookup(struct xe_guc *guc, u32 guc_id)
{
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
index 9b71a986c6ca..f1cf271492ae 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -18,6 +18,8 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc);
void xe_guc_submit_reset_wait(struct xe_guc *guc);
void xe_guc_submit_stop(struct xe_guc *guc);
int xe_guc_submit_start(struct xe_guc *guc);
+void xe_guc_submit_pause(struct xe_guc *guc);
+void xe_guc_submit_unpause(struct xe_guc *guc);
void xe_guc_submit_wedge(struct xe_guc *guc);
int xe_guc_read_stopped(struct xe_guc *guc);
diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.c b/drivers/gpu/drm/xe/xe_sriov_vf.c
index 099a395fbf59..fcd82a0fda48 100644
--- a/drivers/gpu/drm/xe/xe_sriov_vf.c
+++ b/drivers/gpu/drm/xe/xe_sriov_vf.c
@@ -11,6 +11,8 @@
#include "xe_gt_sriov_printk.h"
#include "xe_gt_sriov_vf.h"
#include "xe_guc_ct.h"
+#include "xe_guc_submit.h"
+#include "xe_irq.h"
#include "xe_pm.h"
#include "xe_sriov.h"
#include "xe_sriov_printk.h"
@@ -134,6 +136,44 @@ void xe_sriov_vf_init_early(struct xe_device *xe)
INIT_WORK(&xe->sriov.vf.migration.worker, migration_worker_func);
}
+/**
+ * vf_post_migration_shutdown - Stop the driver activities after VF migration.
+ * @xe: the &xe_device struct instance
+ *
+ * After this VM is migrated and assigned to a new VF, it is running on new
+ * hardware, and therefore many hardware-dependent states and related structures
+ * require fixups. Without fixups, the hardware cannot do any work, and therefore
+ * all GPU pipelines are stalled.
+ * Stop some kernel activities to make the fixup process faster: GuC submission
+ * is paused on every GT of the device. The pause is reverted by
+ * vf_post_migration_kickstart().
+ */
+static void vf_post_migration_shutdown(struct xe_device *xe)
+{
+	struct xe_gt *gt;
+	unsigned int id;
+
+	for_each_gt(gt, xe, id)
+		xe_guc_submit_pause(&gt->uc.guc);
+}
+
+/**
+ * vf_post_migration_kickstart - Re-start the driver activities on new hardware.
+ * @xe: the &xe_device struct instance
+ *
+ * After we have finished with all post-migration fixups, restart the driver
+ * activities to continue feeding the GPU with workloads. Interrupts are
+ * resumed first, then GuC submission is unpaused on every GT of the device,
+ * reverting vf_post_migration_shutdown().
+ */
+static void vf_post_migration_kickstart(struct xe_device *xe)
+{
+	struct xe_gt *gt;
+	unsigned int id;
+
+	/* make sure interrupts on the new HW are properly set */
+	xe_irq_resume(xe);
+
+	for_each_gt(gt, xe, id)
+		xe_guc_submit_unpause(&gt->uc.guc);
+}
+
/**
* xe_sriov_vf_post_migration_reset_guc_state - Reset VF state in all GuCs.
* @xe: the &xe_device struct instance
@@ -247,6 +287,7 @@ static void vf_post_migration_recovery(struct xe_device *xe)
drm_dbg(&xe->drm, "migration recovery in progress\n");
xe_pm_runtime_get(xe);
+ vf_post_migration_shutdown(xe);
err = vf_post_migration_requery_guc(xe);
if (vf_post_migration_imminent(xe))
goto defer;
@@ -258,6 +299,7 @@ static void vf_post_migration_recovery(struct xe_device *xe)
if (need_fixups)
vf_post_migration_fixup_ctb(xe);
+ vf_post_migration_kickstart(xe);
vf_post_migration_notify_resfix_done(xe);
xe_pm_runtime_put(xe);
drm_notice(&xe->drm, "migration recovery ended\n");
--
2.25.1
next prev parent reply other threads:[~2025-05-19 23:19 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-05-19 23:19 [PATCH v3 0/7] drm/xe/vf: Post-migration recovery of queues and jobs Tomasz Lis
2025-05-19 23:19 ` [PATCH v3 1/7] drm/xe/sa: Avoid caching GGTT address within the manager Tomasz Lis
2025-05-19 23:19 ` [PATCH v3 2/7] drm/xe/vf: Finish RESFIX by reset if CTB not enabled Tomasz Lis
2025-05-27 11:56 ` K V P, Satyanarayana
2025-05-27 14:14 ` Lis, Tomasz
2025-05-19 23:19 ` Tomasz Lis [this message]
2025-05-27 13:10 ` [PATCH v3 3/7] drm/xe/vf: Pause submissions during RESFIX fixups K V P, Satyanarayana
2025-05-27 14:28 ` Lis, Tomasz
2025-05-28 20:16 ` Michał Winiarski
2025-05-31 0:05 ` Lis, Tomasz
2025-05-19 23:19 ` [PATCH v3 4/7] drm/xe: Block reset while recovering from VF migration Tomasz Lis
2025-05-28 20:02 ` Michał Winiarski
2025-06-03 20:23 ` Lis, Tomasz
2025-05-19 23:19 ` [PATCH v3 5/7] drm/xe/vf: Rebase HWSP of all contexts after migration Tomasz Lis
2025-05-27 13:45 ` K V P, Satyanarayana
2025-05-28 12:49 ` Michał Winiarski
2025-06-03 20:23 ` Lis, Tomasz
2025-05-19 23:19 ` [PATCH v3 6/7] drm/xe/vf: Rebase MEMIRQ structures for " Tomasz Lis
2025-05-27 14:06 ` K V P, Satyanarayana
2025-05-28 10:44 ` Michał Winiarski
2025-05-29 1:19 ` Lis, Tomasz
2025-05-19 23:19 ` [PATCH v3 7/7] drm/xe/vf: Post migration, repopulate ring area for pending request Tomasz Lis
2025-05-28 10:54 ` Michał Winiarski
2025-05-30 23:03 ` Lis, Tomasz
2025-05-20 0:00 ` ✓ CI.Patch_applied: success for drm/xe/vf: Post-migration recovery of queues and jobs (rev3) Patchwork
2025-05-20 0:00 ` ✗ CI.checkpatch: warning " Patchwork
2025-05-20 0:01 ` ✓ CI.KUnit: success " Patchwork
2025-05-20 0:11 ` ✓ CI.Build: " Patchwork
2025-05-20 0:14 ` ✓ CI.Hooks: " Patchwork
2025-05-20 0:15 ` ✓ CI.checksparse: " Patchwork
2025-05-20 0:39 ` ✓ Xe.CI.BAT: " Patchwork
2025-05-20 12:47 ` ✗ Xe.CI.Full: failure " Patchwork
2025-05-27 5:49 ` ✗ CI.Patch_applied: " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250519231925.3196154-4-tomasz.lis@intel.com \
--to=tomasz.lis@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=lucas.demarchi@intel.com \
--cc=matthew.brost@intel.com \
--cc=michal.wajdeczko@intel.com \
--cc=michal.winiarski@intel.com \
--cc=piotr.piorkowski@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox