Also optionally add a WRITE_DATA job-start marker packet. It can be used to detect that the job has started, which allows a shorter timeout in the FENCE_WAIT_MULTI (FWM) packet negative tests.
Signed-off-by: Yogesh Mohan Marimuthu <yogesh.mohanmarimuthu@amd.com>
---
lib/amdgpu/amd_PM4.h | 6 ++++++
lib/amdgpu/amd_command_submission.c | 5 +++++
lib/amdgpu/amd_ip_blocks.c | 27 +++++++++++++++++++++++++++
lib/amdgpu/amd_ip_blocks.h | 5 +++++
4 files changed, 43 insertions(+)
diff --git a/lib/amdgpu/amd_PM4.h b/lib/amdgpu/amd_PM4.h
index 8f59b4223..923826656 100644
--- a/lib/amdgpu/amd_PM4.h
+++ b/lib/amdgpu/amd_PM4.h
@@ -224,4 +224,10 @@
#define PACKET3_INCREMENT_CE_COUNTER 0x84
#define PACKET3_WAIT_ON_CE_COUNTER 0x86
+#define PACKET3_FENCE_WAIT_MULTI 0xD1
+#define FWM_ENGINE_SEL(x) (((x) & 1) << 0) /* 1-bit field at bit 0; argument parenthesized so expressions are masked as a whole */
+#define FWM_PREEMPTABLE(x) (((x) & 1) << 1) /* 1-bit field at bit 1 */
+#define FWM_CACHE_POLICY(x) (((x) & 3) << 2) /* 2-bit field at bits 3:2 */
+#define FWM_POLL_INTERVAL(x) (((x) & 0xFFFFu) << 16) /* 16-bit field at bits 31:16; unsigned mask avoids shifting into the sign bit */
+
#endif
diff --git a/lib/amdgpu/amd_command_submission.c b/lib/amdgpu/amd_command_submission.c
index 52327c69e..399171b44 100644
--- a/lib/amdgpu/amd_command_submission.c
+++ b/lib/amdgpu/amd_command_submission.c
@@ -10,6 +10,7 @@
#include "lib/amdgpu/amd_sdma.h"
#include "lib/amdgpu/amd_PM4.h"
#include "lib/amdgpu/amd_command_submission.h"
+#include "lib/amdgpu/amdgpu_asic_addr.h"
#include "ioctl_wrappers.h"
@@ -185,6 +186,10 @@ static void amdgpu_create_ip_queues(amdgpu_device_handle device,
ring_context[ring_id].pm4_size = pm4_dw;
ring_context[ring_id].res_cnt = 1;
ring_context[ring_id].user_queue = user_queue;
+ if (ip_block->funcs->family_id == FAMILY_GFX1150)
+ ring_context[ring_id].max_num_fences_fwm = 4;
+ else
+ ring_context[ring_id].max_num_fences_fwm = 32;
igt_assert(ring_context[ring_id].pm4);
/* Copy the previously queried HW IP info instead of querying again */
diff --git a/lib/amdgpu/amd_ip_blocks.c b/lib/amdgpu/amd_ip_blocks.c
index 12f92a1d5..e8803c6e2 100644
--- a/lib/amdgpu/amd_ip_blocks.c
+++ b/lib/amdgpu/amd_ip_blocks.c
@@ -659,6 +659,33 @@ user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_
amdgpu_sdma_pkt_end();
} else {
amdgpu_pkt_begin();
+
+ if (ring_context->job_start_write_data_va_addr) {
+ amdgpu_pkt_add_dw(PACKET3(PACKET3_WRITE_DATA, 4));
+ amdgpu_pkt_add_dw(WRITE_DATA_DST_SEL(5) | WR_CONFIRM | WRITE_DATA_CACHE_POLICY(3));
+ amdgpu_pkt_add_dw(ring_context->job_start_write_data_va_addr);
+ amdgpu_pkt_add_dw(ring_context->job_start_write_data_va_addr >> 32);
Please use lower_32_bits() and upper_32_bits() everywhere, as is already done in the earlier part of this function.
+ amdgpu_pkt_add_dw(ring_context->job_start_write_data_val);
+ amdgpu_pkt_add_dw(ring_context->job_start_write_data_val >> 32);
+ }
+
+ if (ring_context->num_fences) {
+ unsigned num_fences_in_iter;
+
+ for (unsigned i = 0; i < ring_context->num_fences; i = i + ring_context->max_num_fences_fwm) {
+ num_fences_in_iter = (i + ring_context->max_num_fences_fwm > ring_context->num_fences) ?
+ ring_context->num_fences - i : ring_context->max_num_fences_fwm;
+ amdgpu_pkt_add_dw(PACKET3(PACKET3_FENCE_WAIT_MULTI, num_fences_in_iter * 4));
+ amdgpu_pkt_add_dw(FWM_ENGINE_SEL(1) | FWM_POLL_INTERVAL(4));
+ for (unsigned j = 0; j < num_fences_in_iter; j++) {
Please use the lower_32_bits() and upper_32_bits() macros here:
+ amdgpu_pkt_add_dw(ring_context->fence_info[i + j].va);
+ amdgpu_pkt_add_dw(ring_context->fence_info[i + j].va >> 32);
+ amdgpu_pkt_add_dw(ring_context->fence_info[i + j].value);
+ amdgpu_pkt_add_dw(ring_context->fence_info[i + j].value >> 32);
+ }
+ }
+ }
+
/* Prepare the Indirect IB to submit the IB to user queue */
amdgpu_pkt_add_dw(PACKET3(PACKET3_INDIRECT_BUFFER, 2));
amdgpu_pkt_add_dw(lower_32_bits(mc_address));
diff --git a/lib/amdgpu/amd_ip_blocks.h b/lib/amdgpu/amd_ip_blocks.h
index 424b3210a..17f94fd5b 100644
--- a/lib/amdgpu/amd_ip_blocks.h
+++ b/lib/amdgpu/amd_ip_blocks.h
@@ -279,6 +279,11 @@ struct amdgpu_ring_context {
bool user_queue;
uint64_t time_out;
enum uq_submission_mode submit_mode;
+ uint32_t max_num_fences_fwm;
+ uint64_t num_fences;
+ struct drm_amdgpu_userq_fence_info *fence_info;
Please remove these variables and keep the logic local to the function.
We should only add fields to this struct when it’s absolutely necessary. We can’t balloon the struct size just for internal/test-specific logic—this struct is used across all tests.
+ uint64_t job_start_write_data_va_addr;
+ uint64_t job_start_write_data_val;
struct drm_amdgpu_info_uq_fw_areas info;
};