From: Jesse.Zhang <Jesse.Zhang@amd.com>
To: <igt-dev@lists.freedesktop.org>
Cc: Vitaly Prosyak <vitaly.prosyak@amd.com>,
Alex Deucher <alexander.deucher@amd.com>,
Christian Koenig <christian.koenig@amd.com>,
Jesse.Zhang <Jesse.Zhang@amd.com>,
Jesse Zhang <jesse.zhang@amd.com>
Subject: [PATCH i-g-t v3] lib/amdgpu: implement selective sync skipping for error injection tests
Date: Thu, 8 Jan 2026 14:33:36 +0800 [thread overview]
Message-ID: <20260108063451.4110907-1-Jesse.Zhang@amd.com> (raw)
Refactor user queue submission to handle error injection cases where
GPU commands are intentionally invalid and would cause syncobj waits
to hang indefinitely.
Key changes:
1. Introduce `enum uq_submission_mode` with two modes:
- UQ_SUBMIT_NORMAL: Full synchronization (default)
- UQ_SUBMIT_NO_SYNC: Skip sync for error injection
2. Add helper functions:
- `wait_for_packet_consumption()`: Polls the queue's rptr/wptr with a
1 ms sleep between checks and returns -ETIMEDOUT after roughly 2 seconds,
giving the GPU time to process commands without a syncobj wait
- `create_sync_signal()`: Extracts signal creation and wait logic
for normal submissions
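3. Update `user_queue_submit()` to switch between modes:
- NO_SYNC mode: Waits for command consumption via rptr/wptr polling
- NORMAL mode: Creates sync signal and waits for completion
- The wait timeout is now taken from `ring_context->time_out` as-is
(a value of 0 no longer falls back to INT64_MAX), so the user queue
helpers set `time_out = INT64_MAX` explicitly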
4. Select UQ_SUBMIT_NO_SYNC in `amdgpu_test_exec_cs_helper()` when
`expect_failure` is set (error injection tests), and remove the
hardcoded timeout value from `bad_access_helper()`
Benefits:
- Prevents permanent hangs when submitting invalid commands in tests
- Maintains full synchronization for normal operation
- Provides timeout protection for error injection cases
- Improves code organization with clear separation of concerns
- Enables future expansion of submission modes
The fix specifically addresses deadlock test scenarios where invalid
GPU commands would cause `amdgpu_cs_syncobj_wait()` to block forever,
preventing proper resource cleanup in `user_queue_destroy()`.
v2: fix build warning
Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
---
lib/amdgpu/amd_command_submission.c | 20 ++++++--
lib/amdgpu/amd_deadlock_helpers.c | 1 -
lib/amdgpu/amd_ip_blocks.c | 72 +++++++++++++++++++++--------
lib/amdgpu/amd_ip_blocks.h | 7 +++
4 files changed, 76 insertions(+), 24 deletions(-)
diff --git a/lib/amdgpu/amd_command_submission.c b/lib/amdgpu/amd_command_submission.c
index fc5a0ed32..6129b7104 100644
--- a/lib/amdgpu/amd_command_submission.c
+++ b/lib/amdgpu/amd_command_submission.c
@@ -70,6 +70,11 @@ int amdgpu_test_exec_cs_helper(amdgpu_device_handle device, unsigned int ip_type
memcpy(ring_ptr, ring_context->pm4, ring_context->pm4_dw * sizeof(*ring_context->pm4));
if (user_queue) {
+ if (expect_failure)
+ ring_context->submit_mode = UQ_SUBMIT_NO_SYNC;
+ else
+ ring_context->submit_mode = UQ_SUBMIT_NORMAL;
+
r = ip_block->funcs->userq_submit(device, ring_context, ip_type, ib_result_mc_address);
if (!expect_failure)
igt_assert_eq(r, 0);
@@ -180,7 +185,8 @@ static void amdgpu_create_ip_queues(amdgpu_device_handle device,
ring_context[ring_id].pm4_size = pm4_dw;
ring_context[ring_id].res_cnt = 1;
ring_context[ring_id].user_queue = user_queue;
- ring_context[ring_id].time_out = 0;
+ if (user_queue)
+ ring_context[ring_id].time_out = INT64_MAX;
igt_assert(ring_context[ring_id].pm4);
/* Copy the previously queried HW IP info instead of querying again */
@@ -370,7 +376,8 @@ void amdgpu_command_submission_write_linear_helper(amdgpu_device_handle device,
ring_context->pm4_size = pm4_dw;
ring_context->res_cnt = 1;
ring_context->user_queue = user_queue;
- ring_context->time_out = 0;
+ if (user_queue)
+ ring_context->time_out = INT64_MAX;
igt_assert(ring_context->pm4);
r = amdgpu_query_hw_ip_info(device, ip_block->type, 0, &ring_context->hw_ip_info);
@@ -503,7 +510,8 @@ void amdgpu_command_submission_const_fill_helper(amdgpu_device_handle device,
ring_context->pm4_size = pm4_dw;
ring_context->res_cnt = 1;
ring_context->user_queue = user_queue;
- ring_context->time_out = 0;
+ if (user_queue)
+ ring_context->time_out = INT64_MAX;
igt_assert(ring_context->pm4);
r = amdgpu_query_hw_ip_info(device, ip_block->type, 0, &ring_context->hw_ip_info);
igt_assert_eq(r, 0);
@@ -604,7 +612,8 @@ void amdgpu_command_submission_copy_linear_helper(amdgpu_device_handle device,
ring_context->pm4_size = pm4_dw;
ring_context->res_cnt = 2;
ring_context->user_queue = user_queue;
- ring_context->time_out = 0;
+ if (user_queue)
+ ring_context->time_out = INT64_MAX;
igt_assert(ring_context->pm4);
r = amdgpu_query_hw_ip_info(device, ip_block->type, 0, &ring_context->hw_ip_info);
igt_assert_eq(r, 0);
@@ -927,7 +936,8 @@ cmd_context_t* cmd_context_create(amdgpu_device_handle device,
ctx->ring_ctx->ring_id = ring_id;
ctx->ring_ctx->secure = false;
ctx->ring_ctx->user_queue = user_queue;
- ctx->ring_ctx->time_out = 0;
+ if (user_queue)
+ ctx->ring_ctx->time_out = INT64_MAX;
if (user_queue) {
/* Initialize user queue if requested */
diff --git a/lib/amdgpu/amd_deadlock_helpers.c b/lib/amdgpu/amd_deadlock_helpers.c
index 5efb5e73d..01c0f9928 100644
--- a/lib/amdgpu/amd_deadlock_helpers.c
+++ b/lib/amdgpu/amd_deadlock_helpers.c
@@ -347,7 +347,6 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
ring_context->res_cnt = 1;
ring_context->ring_id = 0;
ring_context->user_queue = user_queue;
- ring_context->time_out = 0x7ffff;
igt_assert(ring_context->pm4);
r = amdgpu_bo_alloc_and_map_sync(device_handle,
ring_context->write_length * sizeof(uint32_t),
diff --git a/lib/amdgpu/amd_ip_blocks.c b/lib/amdgpu/amd_ip_blocks.c
index 73bdace5a..0a9487c95 100644
--- a/lib/amdgpu/amd_ip_blocks.c
+++ b/lib/amdgpu/amd_ip_blocks.c
@@ -582,15 +582,55 @@ int amdgpu_timeline_syncobj_wait(amdgpu_device_handle device_handle,
return r;
}
+static
+int wait_for_packet_consumption(struct amdgpu_ring_context *ring_context)
+{
+ uint64_t count = 0;
+
+ while (*ring_context->rptr_cpu == *ring_context->wptr_cpu) {
+ if (count > 2000) {
+ igt_warn("Timeout waiting for bad packet consumption\n");
+ return -ETIMEDOUT;
+ }
+ count++;
+ usleep(1000);
+ }
+ return 0;
+}
+
+static
+int create_sync_signal(amdgpu_device_handle device,
+ struct amdgpu_ring_context *ring_context,
+ uint64_t timeout)
+{
+ uint32_t syncarray[1];
+ struct drm_amdgpu_userq_signal signal_data;
+ int r;
+
+ syncarray[0] = ring_context->timeline_syncobj_handle;
+ signal_data.queue_id = ring_context->queue_id;
+ signal_data.syncobj_handles = (uintptr_t)syncarray;
+ signal_data.num_syncobj_handles = 1;
+ signal_data.bo_read_handles = 0;
+ signal_data.bo_write_handles = 0;
+ signal_data.num_bo_read_handles = 0;
+ signal_data.num_bo_write_handles = 0;
+
+ r = amdgpu_userq_signal(device, &signal_data);
+ if (r)
+ return r;
+
+ return amdgpu_cs_syncobj_wait(device, &ring_context->timeline_syncobj_handle,
+ 1, timeout, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
+}
+
static int
user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
unsigned int ip_type, uint64_t mc_address)
{
int r;
uint32_t control = ring_context->pm4_dw;
- uint32_t syncarray[1];
- struct drm_amdgpu_userq_signal signal_data;
- uint64_t timeout = ring_context->time_out ? ring_context->time_out : INT64_MAX;
+ uint64_t timeout = ring_context->time_out;
unsigned int nop_count;
if (ip_type == AMD_IP_DMA) {
@@ -640,21 +680,17 @@ user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_
#endif
ring_context->doorbell_cpu[DOORBELL_INDEX] = *ring_context->wptr_cpu;
- /* Add a fence packet for signal */
- syncarray[0] = ring_context->timeline_syncobj_handle;
- signal_data.queue_id = ring_context->queue_id;
- signal_data.syncobj_handles = (uintptr_t)syncarray;
- signal_data.num_syncobj_handles = 1;
- signal_data.bo_read_handles = 0;
- signal_data.bo_write_handles = 0;
- signal_data.num_bo_read_handles = 0;
- signal_data.num_bo_write_handles = 0;
-
- r = amdgpu_userq_signal(device, &signal_data);
- igt_assert_eq(r, 0);
-
- r = amdgpu_cs_syncobj_wait(device, &ring_context->timeline_syncobj_handle, 1, timeout,
- DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
+ switch (ring_context->submit_mode) {
+ case UQ_SUBMIT_NO_SYNC:
+ /* Error injection: wait for packet consumption without sync */
+ r = wait_for_packet_consumption(ring_context);
+ break;
+ case UQ_SUBMIT_NORMAL:
+ default:
+ /* Standard submission with full synchronization */
+ r = create_sync_signal(device, ring_context, timeout);
+ break;
+ }
return r;
}
diff --git a/lib/amdgpu/amd_ip_blocks.h b/lib/amdgpu/amd_ip_blocks.h
index 51f492da2..8fd9fde9a 100644
--- a/lib/amdgpu/amd_ip_blocks.h
+++ b/lib/amdgpu/amd_ip_blocks.h
@@ -194,6 +194,12 @@ struct amdgpu_userq_bo {
void *ptr;
};
+/* Submission modes for user queues */
+enum uq_submission_mode {
+ UQ_SUBMIT_NORMAL, /* Full synchronization */
+ UQ_SUBMIT_NO_SYNC, /* Skip sync for error injection */
+};
+
#define for_each_test(t, T) for(typeof(*T) *t = T; t->name; t++)
/* set during execution */
@@ -272,6 +278,7 @@ struct amdgpu_ring_context {
uint64_t point;
bool user_queue;
uint64_t time_out;
+ enum uq_submission_mode submit_mode;
struct drm_amdgpu_info_uq_fw_areas info;
};
--
2.49.0
next reply other threads:[~2026-01-08 6:35 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-08 6:33 Jesse.Zhang [this message]
2026-01-08 7:32 ` ✓ Xe.CI.BAT: success for lib/amdgpu: implement selective sync skipping for error injection tests (rev3) Patchwork
2026-01-08 8:02 ` ✓ i915.CI.BAT: " Patchwork
2026-01-08 9:58 ` ✓ i915.CI.Full: " Patchwork
2026-01-12 1:12 ` [PATCH i-g-t v3] lib/amdgpu: implement selective sync skipping for error injection tests vitaly prosyak
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260108063451.4110907-1-Jesse.Zhang@amd.com \
--to=jesse.zhang@amd.com \
--cc=alexander.deucher@amd.com \
--cc=christian.koenig@amd.com \
--cc=igt-dev@lists.freedesktop.org \
--cc=vitaly.prosyak@amd.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox