public inbox for igt-dev@lists.freedesktop.org
 help / color / mirror / Atom feed
From: Jesse Zhang <Jesse.Zhang@amd.com>
To: <igt-dev@lists.freedesktop.org>
Cc: Vitaly Prosyak <vitaly.prosyak@amd.com>,
	Alex Deucher <alexander.deucher@amd.com>,
	Christian Koenig <christian.koenig@amd.com>,
	Jesse Zhang <Jesse.Zhang@amd.com>,
	Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>,
	Jesse Zhang <jesse.zhang@amd.com>
Subject: [PATCH i-g-t 1/2] lib/amdgpu: restore sched_mask after abnormal subtest exits
Date: Thu, 9 Apr 2026 15:15:48 +0800	[thread overview]
Message-ID: <20260409071558.2658707-1-Jesse.Zhang@amd.com> (raw)

amdgpu_wait_memory_helper(), bad_access_ring_helper(),
amdgpu_hang_sdma_ring_helper(), and mm_queue_test_helper()
temporarily isolate individual rings by writing a reduced mask to the
sched_mask sysfs nodes and restore the original mask only at the end of
each function.

When an igt_assert fires inside the ring iteration loop or in a helper
called from it, IGT unwinds via siglongjmp back to the subtest entry
point and bypasses the normal restore path. This leaves all-but-one ring
disabled for the rest of the test process, so later subtests can observe
disabled schedulers and run into follow-up failures.

Fix this by adding the same three-layer protection used in the dispatch
helpers:
- register a file-scoped igt exit handler that restores the saved mask
  when the process exits with a dirty sched_mask state
- restore a stale dirty mask lazily at the start of the next helper
- keep the existing happy-path restore and clear the dirty flag once the
  mask is restored successfully

This makes the deadlock and multimedia helpers resilient to abnormal
subtest exits without changing their normal ring-isolation behavior.

Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Vitaly Prosyak <vitaly.prosyak@amd.com>

Suggested-by: Vitaly Prosyak <vitaly.prosyak@amd.com>
Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
---
 lib/amdgpu/amd_deadlock_helpers.c | 56 +++++++++++++++++++++++++++++++
 lib/amdgpu/amd_mmd_shared.c       | 47 +++++++++++++++++++++++++-
 2 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/lib/amdgpu/amd_deadlock_helpers.c b/lib/amdgpu/amd_deadlock_helpers.c
index cb37a3564..06c577085 100644
--- a/lib/amdgpu/amd_deadlock_helpers.c
+++ b/lib/amdgpu/amd_deadlock_helpers.c
@@ -27,6 +27,50 @@ struct thread_param {
 static int
 use_uc_mtype = 1;
 
+/*
+ * Static state for sched_mask cleanup on abnormal subtest exit.
+ *
+ * A failing assert in ring iteration helpers can jump over the normal
+ * sched_mask restore path, leaving non-selected rings disabled for later
+ * subtests. Keep one file-scoped backup and restore it from an exit handler.
+ */
+static char sched_mask_sysfs[256];
+static uint64_t sched_mask_saved;
+static bool sched_mask_dirty;
+static bool sched_mask_handler_installed;
+
+static void sched_mask_exit_handler(int sig)
+{
+	char cmd[1024];
+
+	(void)sig;
+
+	if (!sched_mask_dirty)
+		return;
+
+	sched_mask_dirty = false;
+	snprintf(cmd, sizeof(cmd) - 1, "sudo echo  0x%" PRIx64 " > %s",
+		 sched_mask_saved, sched_mask_sysfs);
+	system(cmd);
+}
+
+static void sched_mask_arm(const char *sysfs, uint64_t mask)
+{
+	/* Restore stale state first if a prior subtest exited abnormally. */
+	if (sched_mask_dirty)
+		sched_mask_exit_handler(0);
+
+	strncpy(sched_mask_sysfs, sysfs, sizeof(sched_mask_sysfs) - 1);
+	sched_mask_sysfs[sizeof(sched_mask_sysfs) - 1] = '\0';
+	sched_mask_saved = mask;
+	sched_mask_dirty = true;
+
+	if (!sched_mask_handler_installed) {
+		igt_install_exit_handler(sched_mask_exit_handler);
+		sched_mask_handler_installed = true;
+	}
+}
+
 static void*
 write_mem_address(void *data)
 {
@@ -239,6 +283,9 @@ void amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int
 		igt_info("The scheduling ring only enables one for ip %d\n", ip_type);
 	}
 
+	if (sched_mask > 1)
+		sched_mask_arm(sysfs, sched_mask);
+
 	for (ring_id = 0; ((uint64_t)0x1 << ring_id) <= sched_mask; ring_id += 1) {
 		/* check sched is ready is on the ring. */
 		if (!((1 << ring_id) & sched_mask))
@@ -289,6 +336,7 @@ void amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int
 		snprintf(cmd, sizeof(cmd) - 1, "sudo echo  0x%" PRIx64 " > %s", sched_mask, sysfs);
 		r = system(cmd);
 		igt_assert_eq(r, 0);
+		sched_mask_dirty = false;
 	}
 
 }
@@ -494,6 +542,9 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
 		igt_info("The scheduling ring only enables one for ip %d\n", ip_type);
 	}
 
+	if (sched_mask > 1)
+		sched_mask_arm(sysfs, sched_mask);
+
 	for (ring_id = 0; ((uint64_t)0x1 << ring_id) <= sched_mask; ring_id++) {
 		/* check sched is ready is on the ring. */
 		if (!((1 << ring_id) & sched_mask))
@@ -544,6 +595,7 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
 		snprintf(cmd, sizeof(cmd) - 1, "sudo echo  0x%" PRIx64 " > %s", sched_mask, sysfs);
 		r = system(cmd);
 		igt_assert_eq(r, 0);
+		sched_mask_dirty = false;
 	}
 
 }
@@ -581,6 +633,9 @@ void amdgpu_hang_sdma_ring_helper(amdgpu_device_handle device_handle, uint8_t ha
 	} else
 		sched_mask = 1;
 
+	if (sched_mask > 1)
+		sched_mask_arm(sysfs, sched_mask);
+
 	for (ring_id = 0; ((uint64_t)0x1 << ring_id) <= sched_mask; ring_id++) {
 		/* check sched is ready is on the ring. */
 		if (!((1 << ring_id) & sched_mask))
@@ -613,6 +668,7 @@ void amdgpu_hang_sdma_ring_helper(amdgpu_device_handle device_handle, uint8_t ha
 		snprintf(cmd, sizeof(cmd) - 1, "sudo echo  0x%" PRIx64 " > %s", sched_mask, sysfs);
 		r = system(cmd);
 		igt_assert_eq(r, 0);
+		sched_mask_dirty = false;
 	}
 }
 
diff --git a/lib/amdgpu/amd_mmd_shared.c b/lib/amdgpu/amd_mmd_shared.c
index 588f2302c..39d4eea68 100644
--- a/lib/amdgpu/amd_mmd_shared.c
+++ b/lib/amdgpu/amd_mmd_shared.c
@@ -4,6 +4,47 @@
  */
 #include "amd_mmd_shared.h"
 
+/*
+ * Static state for sched_mask cleanup when subtests abort out of the
+ * per-ring loop before reaching the normal restore path.
+ */
+static char sched_mask_sysfs[256];
+static long sched_mask_saved;
+static bool sched_mask_dirty;
+static bool sched_mask_handler_installed;
+
+static void sched_mask_exit_handler(int sig)
+{
+	char cmd[1024];
+
+	(void)sig;
+
+	if (!sched_mask_dirty)
+		return;
+
+	sched_mask_dirty = false;
+	snprintf(cmd, sizeof(cmd) - 1, "sudo echo  0x%lx > %s",
+		 sched_mask_saved, sched_mask_sysfs);
+	system(cmd);
+}
+
+static void sched_mask_arm(const char *sysfs, long mask)
+{
+	/* Restore stale state first if a prior subtest exited abnormally. */
+	if (sched_mask_dirty)
+		sched_mask_exit_handler(0);
+
+	strncpy(sched_mask_sysfs, sysfs, sizeof(sched_mask_sysfs) - 1);
+	sched_mask_sysfs[sizeof(sched_mask_sysfs) - 1] = '\0';
+	sched_mask_saved = mask;
+	sched_mask_dirty = true;
+
+	if (!sched_mask_handler_installed) {
+		igt_install_exit_handler(sched_mask_exit_handler);
+		sched_mask_handler_installed = true;
+	}
+}
+
 bool
 is_gfx_pipe_removed(uint32_t family_id, uint32_t chip_id, uint32_t chip_rev)
 {
@@ -214,7 +255,7 @@ int
 mm_queue_test_helper(amdgpu_device_handle device_handle, struct mmd_shared_context *context,
 		mm_test_callback callback, int err_type, const struct pci_addr *pci)
 {
-	int r;
+	int r = 0;
 	char cmd[1024];
 	long sched_mask = 0;
 	long mask = 0;
@@ -230,6 +271,9 @@ mm_queue_test_helper(amdgpu_device_handle device_handle, struct mmd_shared_conte
 		sched_mask = 1;
 	}
 
+	if (sched_mask > 1)
+		sched_mask_arm(sysfs, sched_mask);
+
 	mask = sched_mask;
 	for (ring_id = 0;  mask > 0; ring_id++) {
 		/* check sched is ready is on the ring. */
@@ -251,6 +295,7 @@ mm_queue_test_helper(amdgpu_device_handle device_handle, struct mmd_shared_conte
 		snprintf(cmd, sizeof(cmd) - 1, "sudo echo  0x%lx > %s", sched_mask, sysfs);
 		r = system(cmd);
 		igt_assert_eq(r, 0);
+		sched_mask_dirty = false;
 	}
 	return r;
 }
-- 
2.49.0


             reply	other threads:[~2026-04-09  7:16 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-09  7:15 Jesse Zhang [this message]
2026-04-09  7:15 ` [PATCH i-g-t 2/2] tests/amdgpu/queue_reset: Add test for oversize packet length Jesse Zhang
2026-04-10  1:37 ` ✓ Xe.CI.BAT: success for series starting with [i-g-t,1/2] lib/amdgpu: restore sched_mask after abnormal subtest exits Patchwork
2026-04-10  1:48 ` ✓ i915.CI.BAT: " Patchwork
2026-04-10  4:52 ` ✗ Xe.CI.FULL: failure " Patchwork
2026-04-10 19:51 ` ✓ i915.CI.Full: success " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260409071558.2658707-1-Jesse.Zhang@amd.com \
    --to=jesse.zhang@amd.com \
    --cc=alexander.deucher@amd.com \
    --cc=christian.koenig@amd.com \
    --cc=igt-dev@lists.freedesktop.org \
    --cc=pierre-eric.pelloux-prayer@amd.com \
    --cc=vitaly.prosyak@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox