Subject: [PATCH i-g-t v3] lib/amdgpu: implement selective sync skipping for error injection tests
Date: 2026-01-08  6:33 UTC
From: Jesse.Zhang
To: igt-dev
Cc: Vitaly Prosyak, Alex Deucher, Christian Koenig, Jesse.Zhang,
	Jesse Zhang

Refactor user queue submission to handle error injection cases where
GPU commands are intentionally invalid and would cause syncobj waits
to hang indefinitely.

Key changes:
1. Introduce `enum uq_submission_mode` with two modes:
   - UQ_SUBMIT_NORMAL: Full synchronization (default)
   - UQ_SUBMIT_NO_SYNC: Skip sync for error injection

2. Add helper functions:
   - `wait_for_packet_consumption()`: Busy-waits with timeout for GPU
     to process commands without synchronization
   - `create_sync_signal()`: Extracts signal creation and wait logic
     for normal submissions

3. Update `user_queue_submit()` to switch between modes:
   - NO_SYNC mode: Waits for command consumption via rptr/wptr polling
   - NORMAL mode: Creates sync signal and waits for completion

4. Modify `bad_access_helper()` to use UQ_SUBMIT_NO_SYNC mode for
   error injection tests, replacing the hardcoded timeout value

Benefits:
- Prevents permanent hangs when submitting invalid commands in tests
- Maintains full synchronization for normal operation
- Provides timeout protection for error injection cases
- Improves code organization with clear separation of concerns
- Enables future expansion of submission modes

The fix specifically addresses deadlock test scenarios where invalid
GPU commands would cause `amdgpu_cs_syncobj_wait()` to block forever,
preventing proper resource cleanup in `user_queue_destroy()`.
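
Concretely, the NO_SYNC path bounds the worst case: the consumption
poll sleeps 1 ms per iteration and gives up after 2000 iterations, so
a wedged queue costs at most about 2000 x 1 ms = 2 s before
-ETIMEDOUT is returned and teardown can proceed.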

v2: fix build warning


Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
---
 lib/amdgpu/amd_command_submission.c | 20 ++++++--
 lib/amdgpu/amd_deadlock_helpers.c   |  1 -
 lib/amdgpu/amd_ip_blocks.c          | 72 +++++++++++++++++++++--------
 lib/amdgpu/amd_ip_blocks.h          |  7 +++
 4 files changed, 76 insertions(+), 24 deletions(-)

diff --git a/lib/amdgpu/amd_command_submission.c b/lib/amdgpu/amd_command_submission.c
index fc5a0ed32..6129b7104 100644
--- a/lib/amdgpu/amd_command_submission.c
+++ b/lib/amdgpu/amd_command_submission.c
@@ -70,6 +70,11 @@ int amdgpu_test_exec_cs_helper(amdgpu_device_handle device, unsigned int ip_type
 	memcpy(ring_ptr, ring_context->pm4, ring_context->pm4_dw * sizeof(*ring_context->pm4));
 
 	if (user_queue) {
+		if (expect_failure)
+			ring_context->submit_mode = UQ_SUBMIT_NO_SYNC;
+		else
+			ring_context->submit_mode = UQ_SUBMIT_NORMAL;
+
 		r = ip_block->funcs->userq_submit(device, ring_context, ip_type, ib_result_mc_address);
 		if (!expect_failure)
 			igt_assert_eq(r, 0);
@@ -180,7 +185,8 @@ static void amdgpu_create_ip_queues(amdgpu_device_handle device,
 		ring_context[ring_id].pm4_size = pm4_dw;
 		ring_context[ring_id].res_cnt = 1;
 		ring_context[ring_id].user_queue = user_queue;
-		ring_context[ring_id].time_out = 0;
+		if (user_queue)
+			ring_context[ring_id].time_out = INT64_MAX;
 		igt_assert(ring_context[ring_id].pm4);
 
 		/* Copy the previously queried HW IP info instead of querying again */
@@ -370,7 +376,8 @@ void amdgpu_command_submission_write_linear_helper(amdgpu_device_handle device,
 	ring_context->pm4_size = pm4_dw;
 	ring_context->res_cnt = 1;
 	ring_context->user_queue = user_queue;
-	ring_context->time_out = 0;
+	if (user_queue)
+		ring_context->time_out = INT64_MAX;
 	igt_assert(ring_context->pm4);
 
 	r = amdgpu_query_hw_ip_info(device, ip_block->type, 0, &ring_context->hw_ip_info);
@@ -503,7 +510,8 @@ void amdgpu_command_submission_const_fill_helper(amdgpu_device_handle device,
 	ring_context->pm4_size = pm4_dw;
 	ring_context->res_cnt = 1;
 	ring_context->user_queue = user_queue;
-	ring_context->time_out = 0;
+	if (user_queue)
+		ring_context->time_out = INT64_MAX;
 	igt_assert(ring_context->pm4);
 	r = amdgpu_query_hw_ip_info(device, ip_block->type, 0, &ring_context->hw_ip_info);
 	igt_assert_eq(r, 0);
@@ -604,7 +612,8 @@ void amdgpu_command_submission_copy_linear_helper(amdgpu_device_handle device,
 	ring_context->pm4_size = pm4_dw;
 	ring_context->res_cnt = 2;
 	ring_context->user_queue = user_queue;
-	ring_context->time_out = 0;
+	if (user_queue)
+		ring_context->time_out = INT64_MAX;
 	igt_assert(ring_context->pm4);
 	r = amdgpu_query_hw_ip_info(device, ip_block->type, 0, &ring_context->hw_ip_info);
 	igt_assert_eq(r, 0);
@@ -927,7 +936,8 @@ cmd_context_t* cmd_context_create(amdgpu_device_handle device,
 	ctx->ring_ctx->ring_id = ring_id;
 	ctx->ring_ctx->secure = false;
 	ctx->ring_ctx->user_queue = user_queue;
-	ctx->ring_ctx->time_out = 0;
+	if (user_queue)
+		ctx->ring_ctx->time_out = INT64_MAX;
 
 	if (user_queue) {
 	/* Initialize user queue if requested */
diff --git a/lib/amdgpu/amd_deadlock_helpers.c b/lib/amdgpu/amd_deadlock_helpers.c
index 5efb5e73d..01c0f9928 100644
--- a/lib/amdgpu/amd_deadlock_helpers.c
+++ b/lib/amdgpu/amd_deadlock_helpers.c
@@ -347,7 +347,6 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
 	ring_context->res_cnt = 1;
 	ring_context->ring_id = 0;
 	ring_context->user_queue = user_queue;
-	ring_context->time_out = 0x7ffff;
 	igt_assert(ring_context->pm4);
 	r = amdgpu_bo_alloc_and_map_sync(device_handle,
 				    ring_context->write_length * sizeof(uint32_t),
diff --git a/lib/amdgpu/amd_ip_blocks.c b/lib/amdgpu/amd_ip_blocks.c
index 73bdace5a..0a9487c95 100644
--- a/lib/amdgpu/amd_ip_blocks.c
+++ b/lib/amdgpu/amd_ip_blocks.c
@@ -582,15 +582,55 @@ int amdgpu_timeline_syncobj_wait(amdgpu_device_handle device_handle,
 	return r;
 }
 
+static
+int wait_for_packet_consumption(struct amdgpu_ring_context *ring_context)
+{
+	uint64_t count = 0;
+	/* Poll until rptr catches up with wptr, i.e. the GPU consumed the packet */
+	while (*ring_context->rptr_cpu != *ring_context->wptr_cpu) {
+		if (count > 2000) {
+			igt_warn("Timeout waiting for bad packet consumption\n");
+			return -ETIMEDOUT;
+		}
+		count++;
+		usleep(1000);
+	}
+	return 0;
+}
+
+static
+int create_sync_signal(amdgpu_device_handle device,
+		       struct amdgpu_ring_context *ring_context,
+		       uint64_t timeout)
+{
+	uint32_t syncarray[1];
+	struct drm_amdgpu_userq_signal signal_data;
+	int r;
+
+	syncarray[0] = ring_context->timeline_syncobj_handle;
+	signal_data.queue_id = ring_context->queue_id;
+	signal_data.syncobj_handles = (uintptr_t)syncarray;
+	signal_data.num_syncobj_handles = 1;
+	signal_data.bo_read_handles = 0;
+	signal_data.bo_write_handles = 0;
+	signal_data.num_bo_read_handles = 0;
+	signal_data.num_bo_write_handles = 0;
+
+	r = amdgpu_userq_signal(device, &signal_data);
+	if (r)
+		return r;
+
+	return amdgpu_cs_syncobj_wait(device, &ring_context->timeline_syncobj_handle,
+				  1, timeout, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
+}
+
 static int
 user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
 			      unsigned int ip_type, uint64_t mc_address)
 {
 	int r;
 	uint32_t control = ring_context->pm4_dw;
-	uint32_t syncarray[1];
-	struct drm_amdgpu_userq_signal signal_data;
-	uint64_t timeout = ring_context->time_out ? ring_context->time_out : INT64_MAX;
+	uint64_t timeout = ring_context->time_out;
 	unsigned int nop_count;
 
 	if (ip_type == AMD_IP_DMA) {
@@ -640,21 +680,17 @@ user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_
 #endif
 	ring_context->doorbell_cpu[DOORBELL_INDEX] = *ring_context->wptr_cpu;
 
-	/* Add a fence packet for signal */
-	syncarray[0] = ring_context->timeline_syncobj_handle;
-	signal_data.queue_id = ring_context->queue_id;
-	signal_data.syncobj_handles = (uintptr_t)syncarray;
-	signal_data.num_syncobj_handles = 1;
-	signal_data.bo_read_handles = 0;
-	signal_data.bo_write_handles = 0;
-	signal_data.num_bo_read_handles = 0;
-	signal_data.num_bo_write_handles = 0;
-
-	r = amdgpu_userq_signal(device, &signal_data);
-	igt_assert_eq(r, 0);
-
-	r = amdgpu_cs_syncobj_wait(device, &ring_context->timeline_syncobj_handle, 1, timeout,
-				DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
+	switch (ring_context->submit_mode) {
+	case UQ_SUBMIT_NO_SYNC:
+		/* Error injection: wait for packet consumption without sync */
+		r = wait_for_packet_consumption(ring_context);
+		break;
+	case UQ_SUBMIT_NORMAL:
+	default:
+		/* Standard submission with full synchronization */
+		r = create_sync_signal(device, ring_context, timeout);
+		break;
+	}
 	return r;
 }
 
diff --git a/lib/amdgpu/amd_ip_blocks.h b/lib/amdgpu/amd_ip_blocks.h
index 51f492da2..8fd9fde9a 100644
--- a/lib/amdgpu/amd_ip_blocks.h
+++ b/lib/amdgpu/amd_ip_blocks.h
@@ -194,6 +194,12 @@ struct amdgpu_userq_bo {
 	void *ptr;
 };
 
+/* Submission modes for user queues */
+enum uq_submission_mode {
+	UQ_SUBMIT_NORMAL,        /* Full synchronization */
+	UQ_SUBMIT_NO_SYNC,       /* Skip sync for error injection */
+};
+
 #define for_each_test(t, T) for(typeof(*T) *t = T; t->name; t++)
 
 /* set during execution */
@@ -272,6 +278,7 @@ struct amdgpu_ring_context {
 	uint64_t point;
 	bool user_queue;
 	uint64_t time_out;
+	enum uq_submission_mode submit_mode;
 
 	struct drm_amdgpu_info_uq_fw_areas info;
 };
-- 
2.49.0

