From: Zhanjun Dong <zhanjun.dong@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: daniele.ceraolospurio@intel.com, matthew.brost@intel.com,
stuart.summers@intel.com, Zhanjun Dong <zhanjun.dong@intel.com>
Subject: [PATCH v8] drm/xe/uc: Add stop on hardware initialization error
Date: Fri, 28 Nov 2025 10:01:09 -0500 [thread overview]
Message-ID: <20251128150109.3064876-1-zhanjun.dong@intel.com> (raw)
If hardware initialization fails, the hardware might no longer respond, so add
a uc stop to clean up. At driver unload, all exec_queue items need to be freed;
change xe_guc_submit_pause_abort to xe_guc_submit_clear_all to free all
contexts.
This fixes memory leak issues such as:
[ 189.997904] [drm:drm_mm_takedown] *ERROR* node [00f0f000 + 00007000]: inserted at
drm_mm_insert_node_in_range+0x2c0/0x510
__xe_ggtt_insert_bo_at+0x167/0x540 [xe]
xe_ggtt_insert_bo+0x1a/0x30 [xe]
__xe_bo_create_locked+0x1f3/0x930 [xe]
xe_bo_create_pin_map_at_aligned+0x59/0x1f0 [xe]
xe_bo_create_pin_map_at_novm+0xae/0x140 [xe]
xe_bo_create_pin_map_novm+0x23/0x40 [xe]
xe_lrc_create+0x1e4/0x17c0 [xe]
xe_exec_queue_create+0x38a/0x6a0 [xe]
xe_gt_record_default_lrcs+0x117/0x8b0 [xe]
xe_uc_load_hw+0xa2/0x290 [xe]
xe_gt_init+0x357/0xab0 [xe]
xe_device_probe+0x403/0xa30 [xe]
xe_pci_probe+0x39a/0x610 [xe]
local_pci_probe+0x47/0xb0
pci_device_probe+0xf3/0x260
really_probe+0xf1/0x3b0
__driver_probe_device+0x8c/0x180
device_driver_attach+0x57/0xd0
bind_store+0x77/0xd0
drv_attr_store+0x24/0x50
sysfs_kf_write+0x4d/0x80
kernfs_fop_write_iter+0x188/0x240
vfs_write+0x280/0x540
ksys_write+0x6f/0xf0
__x64_sys_write+0x19/0x30
x64_sys_call+0x2171/0x25a0
do_syscall_64+0x93/0xb80
entry_SYSCALL_64_after_hwframe+0x7
and:
[ 189.973775] xe 0000:00:02.0: [drm] *ERROR* Tile0: GT1: GUC ID manager unclean (1/65535)
[ 189.981731] xe 0000:00:02.0: [drm] Tile0: GT1: total 65535
[ 189.981733] xe 0000:00:02.0: [drm] Tile0: GT1: used 1
[ 189.981734] xe 0000:00:02.0: [drm] Tile0: GT1: range 2..2 (1)
Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/5466
Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/5530
Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
---
v8: Fix __mutex_lock warning
v7: Clear all queue items by guc_submit_fini/xe_guc_submit_pause_abort (Matthew)
v6: As huc not involved in vf_uc_load_hw, roll back to guc sanitize
v5: Move stop flag set in guc_fini_hw
Change to uc_sanitize in uc init path
v4: Add memory leak fix
Switch to xe_uc_stop
v3: Switch to xe_guc_stop
v2: Switch to xe_guc_ct_stop
---
drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 2 +-
drivers/gpu/drm/xe/xe_guc.c | 6 ++++++
drivers/gpu/drm/xe/xe_guc_submit.c | 7 +++----
drivers/gpu/drm/xe/xe_guc_submit.h | 2 +-
drivers/gpu/drm/xe/xe_uc.c | 8 +++++++-
5 files changed, 18 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
index 4c73a077d314..04f7f5ac0028 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
@@ -1180,7 +1180,7 @@ static void vf_post_migration_abort(struct xe_gt *gt)
wake_up_all(&gt->sriov.vf.migration.wq);
- xe_guc_submit_pause_abort(&gt->uc.guc);
+ xe_guc_submit_clear_all(&gt->uc.guc);
}
static int vf_post_migration_notify_resfix_done(struct xe_gt *gt)
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index d6672cf30d3e..d8235ae69988 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -662,6 +662,12 @@ static void guc_fini_hw(void *arg)
struct xe_guc *guc = arg;
struct xe_gt *gt = guc_to_gt(guc);
+ if (guc->submission_state.initialized) {
+ xe_guc_reset_prepare(guc);
+ xe_guc_stop(guc);
+ xe_guc_submit_clear_all(guc);
+ }
+
xe_with_force_wake(fw_ref, gt_to_fw(gt), XE_FORCEWAKE_ALL)
xe_uc_sanitize_reset(&guc_to_gt(guc)->uc);
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 7e0882074a99..1bb9c03ba1d7 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -2353,10 +2353,10 @@ void xe_guc_submit_unpause(struct xe_guc *guc)
}
/**
- * xe_guc_submit_pause_abort - Abort all paused submission task on given GuC.
+ * xe_guc_submit_clear_all - Clear all paused submission task on given GuC.
* @guc: the &xe_guc struct instance whose scheduler is to be aborted
*/
-void xe_guc_submit_pause_abort(struct xe_guc *guc)
+void xe_guc_submit_clear_all(struct xe_guc *guc)
{
struct xe_exec_queue *q;
unsigned long index;
@@ -2370,8 +2370,7 @@ void xe_guc_submit_pause_abort(struct xe_guc *guc)
continue;
xe_sched_submission_start(sched);
- if (exec_queue_killed_or_banned_or_wedged(q))
- xe_guc_exec_queue_trigger_cleanup(q);
+ guc_exec_queue_kill(q);
}
mutex_unlock(&guc->submission_state.lock);
}
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
index b49a2748ec46..ab794ee9ca8c 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -23,7 +23,7 @@ int xe_guc_submit_start(struct xe_guc *guc);
void xe_guc_submit_pause(struct xe_guc *guc);
void xe_guc_submit_unpause(struct xe_guc *guc);
void xe_guc_submit_unpause_prepare(struct xe_guc *guc);
-void xe_guc_submit_pause_abort(struct xe_guc *guc);
+void xe_guc_submit_clear_all(struct xe_guc *guc);
void xe_guc_submit_wedge(struct xe_guc *guc);
int xe_guc_read_stopped(struct xe_guc *guc);
diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
index 40aed4a66bac..726bd1bf7e30 100644
--- a/drivers/gpu/drm/xe/xe_uc.c
+++ b/drivers/gpu/drm/xe/xe_uc.c
@@ -173,6 +173,9 @@ static int vf_uc_load_hw(struct xe_uc *uc)
return 0;
err_out:
+ /* Stop guc submission */
+ atomic_fetch_or(1, &uc->guc.submission_state.stopped);
+ xe_uc_stop(uc);
xe_guc_sanitize(&uc->guc);
return err;
}
@@ -231,7 +234,10 @@ int xe_uc_load_hw(struct xe_uc *uc)
return 0;
err_out:
- xe_guc_sanitize(&uc->guc);
+ /* Stop guc submission */
+ atomic_fetch_or(1, &uc->guc.submission_state.stopped);
+ xe_uc_stop(uc);
+ xe_uc_sanitize(uc);
return ret;
}
--
2.34.1
reply other threads:[~2025-11-28 15:01 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251128150109.3064876-1-zhanjun.dong@intel.com \
--to=zhanjun.dong@intel.com \
--cc=daniele.ceraolospurio@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=matthew.brost@intel.com \
--cc=stuart.summers@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox