From: Jesse Zhang <Jesse.Zhang@amd.com>
To: <igt-dev@lists.freedesktop.org>
Cc: Vitaly Prosyak <vitaly.prosyak@amd.com>,
Alex Deucher <alexander.deucher@amd.com>,
Christian Koenig <christian.koenig@amd.com>,
Jesse Zhang <Jesse.Zhang@amd.com>,
Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>,
Jesse Zhang <jesse.zhang@amd.com>
Subject: [PATCH i-g-t 1/2] lib/amdgpu: restore sched_mask after abnormal subtest exits
Date: Thu, 9 Apr 2026 15:15:48 +0800 [thread overview]
Message-ID: <20260409071558.2658707-1-Jesse.Zhang@amd.com> (raw)
amdgpu_wait_memory_helper(), bad_access_ring_helper(),
amdgpu_hang_sdma_ring_helper(), and mm_queue_test_helper()
temporarily isolate individual rings by writing a reduced mask to the
sched_mask sysfs nodes and restore the original mask only at the end of
each helper.
When an igt_assert fires inside the ring iteration loop or in a helper
called from it, IGT unwinds via siglongjmp back to the subtest entry
point and bypasses the normal restore path. This leaves all-but-one ring
disabled for the rest of the test process, so later subtests can observe
disabled schedulers and run into follow-up failures.
Fix this by adding the same three-layer protection used in the dispatch
helpers:
- register a file-scoped igt exit handler that restores the saved mask
when the process exits with a dirty sched_mask state
- restore a stale dirty mask lazily at the start of the next helper
- keep the existing happy-path restore and clear the dirty flag once the
mask is restored successfully
This makes the deadlock and multimedia helpers resilient to abnormal
subtest exits without changing their normal ring-isolation behavior.
Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Vitaly Prosyak <vitaly.prosyak@amd.com>
Suggested-by: Vitaly Prosyak <vitaly.prosyak@amd.com>
Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
---
lib/amdgpu/amd_deadlock_helpers.c | 56 +++++++++++++++++++++++++++++++
lib/amdgpu/amd_mmd_shared.c | 47 +++++++++++++++++++++++++-
2 files changed, 102 insertions(+), 1 deletion(-)
diff --git a/lib/amdgpu/amd_deadlock_helpers.c b/lib/amdgpu/amd_deadlock_helpers.c
index cb37a3564..06c577085 100644
--- a/lib/amdgpu/amd_deadlock_helpers.c
+++ b/lib/amdgpu/amd_deadlock_helpers.c
@@ -27,6 +27,50 @@ struct thread_param {
static int
use_uc_mtype = 1;
+/*
+ * Static state for sched_mask cleanup on abnormal subtest exit.
+ *
+ * A failing assert in ring iteration helpers can jump over the normal
+ * sched_mask restore path, leaving non-selected rings disabled for later
+ * subtests. Keep one file-scoped backup and restore it from an exit handler.
+ */
+static char sched_mask_sysfs[256];
+static uint64_t sched_mask_saved;
+static bool sched_mask_dirty;
+static bool sched_mask_handler_installed;
+
+static void sched_mask_exit_handler(int sig)
+{
+ char cmd[1024];
+
+ (void)sig;
+
+ if (!sched_mask_dirty)
+ return;
+
+ sched_mask_dirty = false;
+ snprintf(cmd, sizeof(cmd) - 1, "sudo echo 0x%" PRIx64 " > %s",
+ sched_mask_saved, sched_mask_sysfs);
+ system(cmd);
+}
+
+static void sched_mask_arm(const char *sysfs, uint64_t mask)
+{
+ /* Restore stale state first if a prior subtest exited abnormally. */
+ if (sched_mask_dirty)
+ sched_mask_exit_handler(0);
+
+ strncpy(sched_mask_sysfs, sysfs, sizeof(sched_mask_sysfs) - 1);
+ sched_mask_sysfs[sizeof(sched_mask_sysfs) - 1] = '\0';
+ sched_mask_saved = mask;
+ sched_mask_dirty = true;
+
+ if (!sched_mask_handler_installed) {
+ igt_install_exit_handler(sched_mask_exit_handler);
+ sched_mask_handler_installed = true;
+ }
+}
+
static void*
write_mem_address(void *data)
{
@@ -239,6 +283,9 @@ void amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int
igt_info("The scheduling ring only enables one for ip %d\n", ip_type);
}
+ if (sched_mask > 1)
+ sched_mask_arm(sysfs, sched_mask);
+
for (ring_id = 0; ((uint64_t)0x1 << ring_id) <= sched_mask; ring_id += 1) {
/* check sched is ready is on the ring. */
if (!((1 << ring_id) & sched_mask))
@@ -289,6 +336,7 @@ void amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int
snprintf(cmd, sizeof(cmd) - 1, "sudo echo 0x%" PRIx64 " > %s", sched_mask, sysfs);
r = system(cmd);
igt_assert_eq(r, 0);
+ sched_mask_dirty = false;
}
}
@@ -494,6 +542,9 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
igt_info("The scheduling ring only enables one for ip %d\n", ip_type);
}
+ if (sched_mask > 1)
+ sched_mask_arm(sysfs, sched_mask);
+
for (ring_id = 0; ((uint64_t)0x1 << ring_id) <= sched_mask; ring_id++) {
/* check sched is ready is on the ring. */
if (!((1 << ring_id) & sched_mask))
@@ -544,6 +595,7 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
snprintf(cmd, sizeof(cmd) - 1, "sudo echo 0x%" PRIx64 " > %s", sched_mask, sysfs);
r = system(cmd);
igt_assert_eq(r, 0);
+ sched_mask_dirty = false;
}
}
@@ -581,6 +633,9 @@ void amdgpu_hang_sdma_ring_helper(amdgpu_device_handle device_handle, uint8_t ha
} else
sched_mask = 1;
+ if (sched_mask > 1)
+ sched_mask_arm(sysfs, sched_mask);
+
for (ring_id = 0; ((uint64_t)0x1 << ring_id) <= sched_mask; ring_id++) {
/* check sched is ready is on the ring. */
if (!((1 << ring_id) & sched_mask))
@@ -613,6 +668,7 @@ void amdgpu_hang_sdma_ring_helper(amdgpu_device_handle device_handle, uint8_t ha
snprintf(cmd, sizeof(cmd) - 1, "sudo echo 0x%" PRIx64 " > %s", sched_mask, sysfs);
r = system(cmd);
igt_assert_eq(r, 0);
+ sched_mask_dirty = false;
}
}
diff --git a/lib/amdgpu/amd_mmd_shared.c b/lib/amdgpu/amd_mmd_shared.c
index 588f2302c..39d4eea68 100644
--- a/lib/amdgpu/amd_mmd_shared.c
+++ b/lib/amdgpu/amd_mmd_shared.c
@@ -4,6 +4,47 @@
*/
#include "amd_mmd_shared.h"
+/*
+ * Static state for sched_mask cleanup when subtests abort out of the
+ * per-ring loop before reaching the normal restore path.
+ */
+static char sched_mask_sysfs[256];
+static long sched_mask_saved;
+static bool sched_mask_dirty;
+static bool sched_mask_handler_installed;
+
+static void sched_mask_exit_handler(int sig)
+{
+ char cmd[1024];
+
+ (void)sig;
+
+ if (!sched_mask_dirty)
+ return;
+
+ sched_mask_dirty = false;
+ snprintf(cmd, sizeof(cmd) - 1, "sudo echo 0x%lx > %s",
+ sched_mask_saved, sched_mask_sysfs);
+ system(cmd);
+}
+
+static void sched_mask_arm(const char *sysfs, long mask)
+{
+ /* Restore stale state first if a prior subtest exited abnormally. */
+ if (sched_mask_dirty)
+ sched_mask_exit_handler(0);
+
+ strncpy(sched_mask_sysfs, sysfs, sizeof(sched_mask_sysfs) - 1);
+ sched_mask_sysfs[sizeof(sched_mask_sysfs) - 1] = '\0';
+ sched_mask_saved = mask;
+ sched_mask_dirty = true;
+
+ if (!sched_mask_handler_installed) {
+ igt_install_exit_handler(sched_mask_exit_handler);
+ sched_mask_handler_installed = true;
+ }
+}
+
bool
is_gfx_pipe_removed(uint32_t family_id, uint32_t chip_id, uint32_t chip_rev)
{
@@ -214,7 +255,7 @@ int
mm_queue_test_helper(amdgpu_device_handle device_handle, struct mmd_shared_context *context,
mm_test_callback callback, int err_type, const struct pci_addr *pci)
{
- int r;
+ int r = 0;
char cmd[1024];
long sched_mask = 0;
long mask = 0;
@@ -230,6 +271,9 @@ mm_queue_test_helper(amdgpu_device_handle device_handle, struct mmd_shared_conte
sched_mask = 1;
}
+ if (sched_mask > 1)
+ sched_mask_arm(sysfs, sched_mask);
+
mask = sched_mask;
for (ring_id = 0; mask > 0; ring_id++) {
/* check sched is ready is on the ring. */
@@ -251,6 +295,7 @@ mm_queue_test_helper(amdgpu_device_handle device_handle, struct mmd_shared_conte
snprintf(cmd, sizeof(cmd) - 1, "sudo echo 0x%lx > %s", sched_mask, sysfs);
r = system(cmd);
igt_assert_eq(r, 0);
+ sched_mask_dirty = false;
}
return r;
}
--
2.49.0
next reply other threads:[~2026-04-09 7:16 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-09 7:15 Jesse Zhang [this message]
2026-04-09 7:15 ` [PATCH i-g-t 2/2] tests/amdgpu/queue_reset: Add test for oversize packet length Jesse Zhang
2026-04-10 1:37 ` ✓ Xe.CI.BAT: success for series starting with [i-g-t,1/2] lib/amdgpu: restore sched_mask after abnormal subtest exits Patchwork
2026-04-10 1:48 ` ✓ i915.CI.BAT: " Patchwork
2026-04-10 4:52 ` ✗ Xe.CI.FULL: failure " Patchwork
2026-04-10 19:51 ` ✓ i915.CI.Full: success " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260409071558.2658707-1-Jesse.Zhang@amd.com \
--to=jesse.zhang@amd.com \
--cc=alexander.deucher@amd.com \
--cc=christian.koenig@amd.com \
--cc=igt-dev@lists.freedesktop.org \
--cc=pierre-eric.pelloux-prayer@amd.com \
--cc=vitaly.prosyak@amd.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox