Ensure the VF post-migration recovery worker is not running during
driver unload.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 17 +++++++++++++++--
drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h | 4 +++-
2 files changed, 18 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
index 807fdced0228..4eaffad6ebcf 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
@@ -811,7 +811,8 @@ static void xe_gt_sriov_vf_start_migration_recovery(struct xe_gt *gt)
spin_lock(&gt->sriov.vf.migration.lock);
- if (!gt->sriov.vf.migration.recovery_queued) {
+ if (!gt->sriov.vf.migration.recovery_queued &&
+     !gt->sriov.vf.migration.recovery_teardown) {
We're registering `vf_migration_fini` very early in init, which means it will be called very late during teardown. With that in mind, is this case even possible to hit?
During `xe_gt_sriov_vf_migration_init_early`, interrupts are not enabled yet. Doesn't that mean they are already disabled again by the time vf_migration_fini is called?
Both ggtt_fini_early and guc_submit_fini should have finished by then, so if recovery were still running it would have already crashed or errored out.
So maybe register the fini later? We need the `init` before IRQs are enabled, but the `fini` only needs to run before `exec_queue_lookup` is torn down - see the sketch below.
-Tomasz
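
A minimal sketch of that suggestion; `xe_gt_sriov_vf_migration_init` is an
illustrative name for a later init stage, not an existing entry point:

	/*
	 * Hypothetical later init stage: runs after IRQs are enabled but
	 * before exec_queue_lookup teardown, so the matching drmm action
	 * fires early enough on unload to stop the worker while the exec
	 * queues are still valid.
	 */
	int xe_gt_sriov_vf_migration_init(struct xe_gt *gt)
	{
		return drmm_add_action_or_reset(&gt_to_xe(gt)->drm,
						vf_migration_fini, gt);
	}

xe_gt_sriov_vf_migration_init_early() would then go back to returning 0.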
gt->sriov.vf.migration.recovery_queued = true;
WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, true);
@@ -1280,6 +1281,17 @@ static void migration_worker_func(struct work_struct *w)
vf_post_migration_recovery(gt);
}
+static void vf_migration_fini(struct drm_device *drm, void *arg)
+{
+ struct xe_gt *gt = arg;
+
+ spin_lock_irq(&gt->sriov.vf.migration.lock);
+ gt->sriov.vf.migration.recovery_teardown = true;
+ spin_unlock_irq(&gt->sriov.vf.migration.lock);
+
+ cancel_work_sync(&gt->sriov.vf.migration.worker);
+}
+
/**
* xe_gt_sriov_vf_migration_init_early() - VF post migration init early
* @gt: the &xe_gt
@@ -1308,7 +1320,8 @@ int xe_gt_sriov_vf_migration_init_early(struct xe_gt *gt)
if (!xe_sriov_vf_migration_supported(gt_to_xe(gt)))
xe_gt_sriov_info(gt, "migration not supported by this module version\n");
- return 0;
+ return drmm_add_action_or_reset(&gt_to_xe(gt)->drm,
+ vf_migration_fini, gt);
}
/**
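
For reference, the teardown ordering the patch relies on, condensed from the
two hunks above; the queue_work() call itself sits outside the quoted
context, so that step is paraphrased, and the plain spin_lock() in the queue
path suggests it runs with IRQs already disabled (hence spin_lock_irq() on
the unload side):

	/* queue side, apparently called with IRQs already disabled */
	spin_lock(&gt->sriov.vf.migration.lock);
	if (!gt->sriov.vf.migration.recovery_queued &&
	    !gt->sriov.vf.migration.recovery_teardown) {
		gt->sriov.vf.migration.recovery_queued = true;
		WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, true);
		/* ... queue the worker, as in the existing code ... */
	}
	spin_unlock(&gt->sriov.vf.migration.lock);

	/* unload side: once recovery_teardown is visible under the lock,
	 * no new work can be queued, so after cancel_work_sync() returns
	 * the worker is neither running nor pending.
	 */
	spin_lock_irq(&gt->sriov.vf.migration.lock);
	gt->sriov.vf.migration.recovery_teardown = true;
	spin_unlock_irq(&gt->sriov.vf.migration.lock);
	cancel_work_sync(&gt->sriov.vf.migration.worker);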
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h b/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h
index 61484c7c9a36..beb9978336bb 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h
@@ -59,10 +59,12 @@ struct xe_gt_sriov_vf_runtime {
struct xe_gt_sriov_vf_migration {
/** @migration: VF migration recovery worker */
struct work_struct worker;
- /** @lock: Protects recovery_queued */
+ /** @lock: Protects recovery_queued and recovery_teardown */
spinlock_t lock;
/** @lrc_wa_bb: Scratch memory for LRC WA BB in recovery */
void *lrc_wa_bb;
+ /** @recovery_teardown: VF post migration recovery is being torn down */
+ bool recovery_teardown;
/** @recovery_queued: VF post migration recovery is queued */
bool recovery_queued;
/** @recovery_inprogress: VF post migration recovery in progress */