[PATCH] lib/xe/xe_util: Creating the helper functions

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] lib/xe/xe_util: Creating the helper functions
@ 2024-03-31 18:59 Bommu Krishnaiah
  2024-03-31 19:04 ` Bommu, Krishnaiah
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Bommu Krishnaiah @ 2024-03-31 18:59 UTC (permalink / raw)
  To: intel-xe
  Cc: Bommu Krishnaiah, Janga Rahul Kumar, Himal Prasad Ghimiray,
	Oak Zeng

Creating the helper functions which can be reused by many tests,
as part of this patch xe_exec_compute_mode and xe_exec_fault_mode
are used this helper functions.

Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu@intel.com>
Cc: Janga Rahul Kumar <janga.rahul.kumar@intel.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Cc: Oak Zeng <oak.zeng@intel.com>
---
 lib/xe/xe_util.c                   | 194 +++++++++++++++++++++++++++++
 lib/xe/xe_util.h                   |  48 +++++++
 tests/intel/xe_exec_compute_mode.c | 190 ++++------------------------
 tests/intel/xe_exec_fault_mode.c   | 179 +++-----------------------
 4 files changed, 282 insertions(+), 329 deletions(-)

diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c
index 050162b5e..17cd45e45 100644
--- a/lib/xe/xe_util.c
+++ b/lib/xe/xe_util.c
@@ -255,3 +255,197 @@ bool xe_is_gt_in_c6(int fd, int gt)
 
 	return strcmp(gt_c_state, "gt-c6") == 0;
 }
+
+struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
+		uint32_t bo_size, uint64_t placement, unsigned int flags)
+{
+	struct cpu_va *data;
+
+	if (flags & USERPTR) {
+		if (flags & INVALIDATE) {
+			data = mmap((void *)MAP_ADDRESS, bo_size, PROT_READ | PROT_WRITE,
+					MAP_SHARED | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
+			igt_assert(data != MAP_FAILED);
+		} else {
+			data = aligned_alloc(xe_get_default_alignment(fd), bo_size);
+			igt_assert(data);
+		}
+	} else {
+		*bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0, bo_size, placement,
+				DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
+		data = xe_bo_map(fd, *bo, bo_size);
+	}
+
+	memset(data, 0, bo_size);
+
+	return data;
+}
+
+void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
+		uint32_t *bind_exec_queues, int n_exec_queues,
+		struct drm_xe_engine_class_instance *eci, unsigned int flags)
+{
+	int i;
+
+	for (i = 0; i < n_exec_queues; i++) {
+		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
+		if (flags & BIND_EXEC_QUEUE)
+			bind_exec_queues[i] = xe_bind_exec_queue_create(fd, vm, 0);
+		else
+			bind_exec_queues[i] = 0;
+	};
+}
+
+void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
+		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
+		struct cpu_va *data, unsigned int flags)
+{
+	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
+
+	sync[0].addr = to_user_pointer(&data[0].vm_sync);
+	if (bo)
+		xe_vm_bind_async(fd, vm, bind_exec_queues, bo, 0, addr, bo_size, sync, 1);
+	else
+		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues,
+				to_user_pointer(data), addr,
+				bo_size, sync, 1);
+
+	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
+			bind_exec_queues, fence_timeout);
+	data[0].vm_sync = 0;
+
+	if (flags & PREFETCH) {
+		/* Should move to system memory */
+		xe_vm_prefetch_async(fd, vm, bind_exec_queues, 0, addr,
+				bo_size, sync, 1, 0);
+		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
+				bind_exec_queues, fence_timeout);
+		data[0].vm_sync = 0;
+	}
+}
+
+void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i)
+{
+	int b = 0;
+	data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
+	data[i].batch[b++] = dst;
+	data[i].batch[b++] = dst >> 32;
+	data[i].batch[b++] = val;
+	data[i].batch[b++] = MI_BATCH_BUFFER_END;
+	igt_assert(b <= ARRAY_SIZE(data[i].batch));
+}
+
+void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
+		uint32_t *exec_queues, uint32_t *bind_exec_queues,
+		int n_exec_queues, uint32_t bo, uint32_t bo_size,
+		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
+		int *map_fd, unsigned int flags)
+{
+	int i;
+	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
+
+	for (i = 0; i < n_execs; i++) {
+		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
+		uint64_t batch_addr = addr + batch_offset;
+		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
+		uint64_t sdi_addr = addr + sdi_offset;
+		int e = i % n_exec_queues;
+
+		if (flags & INVALID_VA)
+			sdi_addr = 0x1fffffffffff000;
+
+		insert_store(sdi_addr, data, 0xc0ffee, i);
+
+		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
+
+		exec->exec_queue_id = exec_queues[e];
+		exec->address = batch_addr;
+		xe_exec(fd, exec);
+
+		if (flags & REBIND && i + 1 != n_execs) {
+			xe_wait_ufence(fd, &data[i].exec_sync, USER_FENCE_VALUE,
+					exec_queues[e], fence_timeout);
+			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
+					addr, bo_size, NULL, 0);
+
+			sync[0].addr = to_user_pointer(&data[0].vm_sync);
+			addr += bo_size;
+
+			if (bo)
+				xe_vm_bind_async(fd, vm, bind_exec_queues[e], bo,
+						0, addr, bo_size, sync, 1);
+			else
+				xe_vm_bind_userptr_async(fd, vm,
+						bind_exec_queues[e],
+						to_user_pointer(data),
+						addr, bo_size, sync, 1);
+
+			xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
+					bind_exec_queues[e], fence_timeout);
+			data[0].vm_sync = 0;
+		}
+
+		if (flags & INVALIDATE && i + 1 != n_execs) {
+			if (!(flags & RACE)) {
+				/*
+				 * Wait for exec completion and check data as
+				 * userptr will likely change to different
+				 * physical memory on next mmap call triggering
+				 * an invalidate.
+				 */
+				xe_wait_ufence(fd, &data[i].exec_sync,
+						USER_FENCE_VALUE, exec_queues[e],
+						fence_timeout);
+				igt_assert_eq(data[i].data, 0xc0ffee);
+			} else if (i * 2 != n_execs) {
+				/*
+				 * We issue 1 mmap which races against running
+				 * jobs. No real check here aside from this test
+				 * not faulting on the GPU.
+				 */
+				continue;
+			}
+
+			if (flags & RACE) {
+				*map_fd = open("/tmp", O_TMPFILE | O_RDWR,
+						0x666);
+				write(*map_fd, data, bo_size);
+				data = mmap((void *)MAP_ADDRESS, bo_size,
+						PROT_READ | PROT_WRITE, MAP_SHARED |
+						MAP_FIXED, *map_fd, 0);
+			} else {
+				data = mmap((void *)MAP_ADDRESS, bo_size,
+						PROT_READ | PROT_WRITE, MAP_SHARED |
+						MAP_FIXED | MAP_ANONYMOUS, -1, 0);
+			}
+
+			igt_assert(data != MAP_FAILED);
+		}
+	}
+}
+
+void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
+		struct drm_xe_sync *sync, struct cpu_va *data, uint64_t addr,
+		uint32_t bo_size)
+{
+	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
+
+	sync[0].addr = to_user_pointer(&data[0].vm_sync);
+	xe_vm_unbind_async(fd, vm, bind_exec_queues, 0, addr, bo_size,
+			sync, 1);
+	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
+			bind_exec_queues, fence_timeout);
+}
+
+void destory_exec_queue(int fd, uint32_t *exec_queues,
+		uint32_t *bind_exec_queues, int n_exec_queues)
+{
+	int i;
+
+	for (i = 0; i < n_exec_queues; i++) {
+		xe_exec_queue_destroy(fd, exec_queues[i]);
+		if (bind_exec_queues[i])
+			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
+	}
+}
+
diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h
index 6480ea01a..aa1b0dcc1 100644
--- a/lib/xe/xe_util.h
+++ b/lib/xe/xe_util.h
@@ -47,4 +47,52 @@ void xe_bind_unbind_async(int fd, uint32_t vm, uint32_t bind_engine,
 
 bool xe_is_gt_in_c6(int fd, int gt);
 
+
+#define USERPTR			(0x1 << 0)
+#define REBIND			(0x1 << 1)
+#define INVALIDATE		(0x1 << 2)
+#define RACE			(0x1 << 3)
+#define BIND_EXEC_QUEUE		(0x1 << 4)
+#define PREFETCH		(0x1 << 5)
+#define INVALID_FAULT		(0x1 << 6)
+#define INVALID_VA		(0x1 << 7)
+#define ENABLE_SCRATCH		(0x1 << 8)
+#define VM_FOR_BO		(0x1 << 9)
+#define EXEC_QUEUE_EARLY	(0x1 << 10)
+
+#define MAX_N_EXEC_QUEUES	16
+#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
+#define MAP_ADDRESS		0x00007fadeadbe000
+
+#define ONE_SEC		MS_TO_NS(1000)
+#define HUNDRED_SEC	MS_TO_NS(100000)
+
+struct cpu_va {
+	uint32_t batch[16];
+	uint64_t pad;
+	uint64_t vm_sync;
+	uint64_t exec_sync;
+	uint32_t data;
+};
+
+struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
+		uint32_t bo_size, uint64_t placement,  unsigned int flags);
+void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
+		uint32_t *bind_exec_queues, int n_exec_queues,
+		struct drm_xe_engine_class_instance *eci, unsigned int flags);
+void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
+		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
+		struct cpu_va *data, unsigned int flags);
+void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
+		uint32_t *exec_queues, uint32_t *bind_exec_queues,
+		int n_exec_queues, uint32_t bo, uint32_t bo_size,
+		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
+		int *map_fd, unsigned int flags);
+void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i);
+void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
+		struct drm_xe_sync *sync, struct cpu_va *data,
+		uint64_t addr, uint32_t bo_size);
+void destory_exec_queue(int fd, uint32_t *exec_queues,
+		uint32_t *bind_exec_queues, int n_exec_queues);
+
 #endif /* XE_UTIL_H */
diff --git a/tests/intel/xe_exec_compute_mode.c b/tests/intel/xe_exec_compute_mode.c
index 3ec848164..e8d82cc69 100644
--- a/tests/intel/xe_exec_compute_mode.c
+++ b/tests/intel/xe_exec_compute_mode.c
@@ -15,6 +15,7 @@
 #include "igt.h"
 #include "lib/igt_syncobj.h"
 #include "lib/intel_reg.h"
+#include "lib/xe/xe_util.h"
 #include <sys/ioctl.h>
 #include "xe_drm.h"
 
@@ -23,15 +24,6 @@
 #include "xe/xe_spin.h"
 #include <string.h>
 
-#define MAX_N_EXECQUEUES 	16
-#define USERPTR				(0x1 << 0)
-#define REBIND				(0x1 << 1)
-#define INVALIDATE			(0x1 << 2)
-#define RACE				(0x1 << 3)
-#define BIND_EXECQUEUE		(0x1 << 4)
-#define VM_FOR_BO			(0x1 << 5)
-#define EXEC_QUEUE_EARLY	(0x1 << 6)
-
 /**
  * SUBTEST: twice-%s
  * Description: Run %arg[1] compute machine test twice
@@ -88,7 +80,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 {
 	uint32_t vm;
 	uint64_t addr = 0x1a0000;
-#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
 	struct drm_xe_sync sync[1] = {
 		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
 		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
@@ -99,161 +90,38 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 		.num_syncs = 1,
 		.syncs = to_user_pointer(sync),
 	};
-	uint32_t exec_queues[MAX_N_EXECQUEUES];
-	uint32_t bind_exec_queues[MAX_N_EXECQUEUES];
+	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
+	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
 	size_t bo_size;
 	uint32_t bo = 0;
-	struct {
-		uint32_t batch[16];
-		uint64_t pad;
-		uint64_t vm_sync;
-		uint64_t exec_sync;
-		uint32_t data;
-	} *data;
-	int i, j, b;
+	struct cpu_va *data;
+	int i, j;
 	int map_fd = -1;
 	int64_t fence_timeout;
+	uint64_t placement;
 
-	igt_assert(n_exec_queues <= MAX_N_EXECQUEUES);
+	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
 
 	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
 	bo_size = sizeof(*data) * n_execs;
 	bo_size = xe_bb_size(fd, bo_size);
 
-	for (i = 0; (flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
-		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
-		if (flags & BIND_EXECQUEUE)
-			bind_exec_queues[i] =
-				xe_bind_exec_queue_create(fd, vm, 0);
-		else
-			bind_exec_queues[i] = 0;
-	};
+	if (flags & EXEC_QUEUE_EARLY)
+		create_exec_queue(fd, vm, exec_queues, bind_exec_queues, n_exec_queues, eci, flags);
 
-	if (flags & USERPTR) {
-#define	MAP_ADDRESS	0x00007fadeadbe000
-		if (flags & INVALIDATE) {
-			data = mmap((void *)MAP_ADDRESS, bo_size, PROT_READ |
-				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
-				    MAP_ANONYMOUS, -1, 0);
-			igt_assert(data != MAP_FAILED);
-		} else {
-			data = aligned_alloc(xe_get_default_alignment(fd),
-					     bo_size);
-			igt_assert(data);
-		}
-	} else {
-		bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0,
-				  bo_size, vram_if_possible(fd, eci->gt_id),
-				  DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
-		data = xe_bo_map(fd, bo, bo_size);
-	}
-	memset(data, 0, bo_size);
+	placement = vram_if_possible(fd, eci->gt_id);
 
-	for (i = 0; !(flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
-		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
-		if (flags & BIND_EXECQUEUE)
-			bind_exec_queues[i] =
-				xe_bind_exec_queue_create(fd, vm, 0);
-		else
-			bind_exec_queues[i] = 0;
-	};
+	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
 
-	sync[0].addr = to_user_pointer(&data[0].vm_sync);
-	if (bo)
-		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
-				 bo_size, sync, 1);
-	else
-		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
-					 to_user_pointer(data), addr,
-					 bo_size, sync, 1);
-#define ONE_SEC	MS_TO_NS(1000)
-#define HUNDRED_SEC	MS_TO_NS(100000)
+	if(!(flags & EXEC_QUEUE_EARLY))
+		create_exec_queue(fd, vm, exec_queues, bind_exec_queues, n_exec_queues, eci, flags);
+
+	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data, flags);
 
 	fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
 
-	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
-		       bind_exec_queues[0], fence_timeout);
-	data[0].vm_sync = 0;
-
-	for (i = 0; i < n_execs; i++) {
-		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
-		uint64_t batch_addr = addr + batch_offset;
-		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
-		uint64_t sdi_addr = addr + sdi_offset;
-		int e = i % n_exec_queues;
-
-		b = 0;
-		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
-		data[i].batch[b++] = sdi_addr;
-		data[i].batch[b++] = sdi_addr >> 32;
-		data[i].batch[b++] = 0xc0ffee;
-		data[i].batch[b++] = MI_BATCH_BUFFER_END;
-		igt_assert(b <= ARRAY_SIZE(data[i].batch));
-
-		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
-
-		exec.exec_queue_id = exec_queues[e];
-		exec.address = batch_addr;
-		xe_exec(fd, &exec);
-
-		if (flags & REBIND && i + 1 != n_execs) {
-			xe_wait_ufence(fd, &data[i].exec_sync, USER_FENCE_VALUE,
-				       exec_queues[e], fence_timeout);
-			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
-					   addr, bo_size, NULL, 0);
-
-			sync[0].addr = to_user_pointer(&data[0].vm_sync);
-			addr += bo_size;
-			if (bo)
-				xe_vm_bind_async(fd, vm, bind_exec_queues[e], bo,
-						 0, addr, bo_size, sync, 1);
-			else
-				xe_vm_bind_userptr_async(fd, vm,
-							 bind_exec_queues[e],
-							 to_user_pointer(data),
-							 addr, bo_size, sync,
-							 1);
-			xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
-				       bind_exec_queues[e], fence_timeout);
-			data[0].vm_sync = 0;
-		}
-
-		if (flags & INVALIDATE && i + 1 != n_execs) {
-			if (!(flags & RACE)) {
-				/*
-				 * Wait for exec completion and check data as
-				 * userptr will likely change to different
-				 * physical memory on next mmap call triggering
-				 * an invalidate.
-				 */
-				xe_wait_ufence(fd, &data[i].exec_sync,
-					       USER_FENCE_VALUE, exec_queues[e],
-					       fence_timeout);
-				igt_assert_eq(data[i].data, 0xc0ffee);
-			} else if (i * 2 != n_execs) {
-				/*
-				 * We issue 1 mmap which races against running
-				 * jobs. No real check here aside from this test
-				 * not faulting on the GPU.
-				 */
-				continue;
-			}
-
-			if (flags & RACE) {
-				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
-					      0x666);
-				write(map_fd, data, bo_size);
-				data = mmap((void *)MAP_ADDRESS, bo_size,
-					    PROT_READ | PROT_WRITE, MAP_SHARED |
-					    MAP_FIXED, map_fd, 0);
-			} else {
-				data = mmap((void *)MAP_ADDRESS, bo_size,
-					    PROT_READ | PROT_WRITE, MAP_SHARED |
-					    MAP_FIXED | MAP_ANONYMOUS, -1, 0);
-			}
-			igt_assert(data != MAP_FAILED);
-		}
-	}
+	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
+			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd, flags);
 
 	j = flags & INVALIDATE ? n_execs - 1 : 0;
 	for (i = j; i < n_execs; i++)
@@ -264,20 +132,12 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 	if (flags & INVALIDATE)
 		usleep(250000);
 
-	sync[0].addr = to_user_pointer(&data[0].vm_sync);
-	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
-			   sync, 1);
-	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
-		       bind_exec_queues[0], fence_timeout);
+	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
 
 	for (i = j; i < n_execs; i++)
 		igt_assert_eq(data[i].data, 0xc0ffee);
 
-	for (i = 0; i < n_exec_queues; i++) {
-		xe_exec_queue_destroy(fd, exec_queues[i]);
-		if (bind_exec_queues[i])
-			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
-	}
+	destory_exec_queue(fd, exec_queues, bind_exec_queues, n_exec_queues);
 
 	if (bo) {
 		munmap(data, bo_size);
@@ -492,14 +352,14 @@ igt_main
 		{ "userptr-rebind", USERPTR | REBIND },
 		{ "userptr-invalidate", USERPTR | INVALIDATE },
 		{ "userptr-invalidate-race", USERPTR | INVALIDATE | RACE },
-		{ "bindexecqueue", BIND_EXECQUEUE },
-		{ "bindexecqueue-userptr", BIND_EXECQUEUE | USERPTR },
-		{ "bindexecqueue-rebind",  BIND_EXECQUEUE | REBIND },
-		{ "bindexecqueue-userptr-rebind",  BIND_EXECQUEUE | USERPTR |
+		{ "bindexecqueue", BIND_EXEC_QUEUE },
+		{ "bindexecqueue-userptr", BIND_EXEC_QUEUE | USERPTR },
+		{ "bindexecqueue-rebind",  BIND_EXEC_QUEUE | REBIND },
+		{ "bindexecqueue-userptr-rebind",  BIND_EXEC_QUEUE | USERPTR |
 			REBIND },
-		{ "bindexecqueue-userptr-invalidate",  BIND_EXECQUEUE | USERPTR |
+		{ "bindexecqueue-userptr-invalidate",  BIND_EXEC_QUEUE | USERPTR |
 			INVALIDATE },
-		{ "bindexecqueue-userptr-invalidate-race", BIND_EXECQUEUE | USERPTR |
+		{ "bindexecqueue-userptr-invalidate-race", BIND_EXEC_QUEUE | USERPTR |
 			INVALIDATE | RACE },
 		{ NULL },
 	};
diff --git a/tests/intel/xe_exec_fault_mode.c b/tests/intel/xe_exec_fault_mode.c
index 40fe1743e..7b2fef224 100644
--- a/tests/intel/xe_exec_fault_mode.c
+++ b/tests/intel/xe_exec_fault_mode.c
@@ -16,24 +16,13 @@
 #include "igt.h"
 #include "lib/igt_syncobj.h"
 #include "lib/intel_reg.h"
+#include "lib/xe/xe_util.h"
 #include "xe_drm.h"
 
 #include "xe/xe_ioctl.h"
 #include "xe/xe_query.h"
 #include <string.h>
 
-#define MAX_N_EXEC_QUEUES	16
-
-#define USERPTR		(0x1 << 0)
-#define REBIND		(0x1 << 1)
-#define INVALIDATE	(0x1 << 2)
-#define RACE		(0x1 << 3)
-#define BIND_EXEC_QUEUE	(0x1 << 4)
-#define PREFETCH	(0x1 << 5)
-#define INVALID_FAULT	(0x1 << 6)
-#define INVALID_VA	(0x1 << 7)
-#define ENABLE_SCRATCH  (0x1 << 8)
-
 /**
  * SUBTEST: invalid-va
  * Description: Access invalid va and check for EIO through user fence.
@@ -99,7 +88,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 {
 	uint32_t vm;
 	uint64_t addr = 0x1a0000;
-#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
 	struct drm_xe_sync sync[1] = {
 		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
 	          .timeline_value = USER_FENCE_VALUE },
@@ -113,14 +101,9 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
 	size_t bo_size;
 	uint32_t bo = 0;
-	struct {
-		uint32_t batch[16];
-		uint64_t pad;
-		uint64_t vm_sync;
-		uint64_t exec_sync;
-		uint32_t data;
-	} *data;
-	int i, j, b;
+	struct cpu_va *data;
+	uint64_t placement;
+	int i, j;
 	int map_fd = -1;
 
 	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
@@ -134,144 +117,19 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 	bo_size = sizeof(*data) * n_execs;
 	bo_size = xe_bb_size(fd, bo_size);
 
-	if (flags & USERPTR) {
-#define	MAP_ADDRESS	0x00007fadeadbe000
-		if (flags & INVALIDATE) {
-			data = mmap((void *)MAP_ADDRESS, bo_size, PROT_READ |
-				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
-				    MAP_ANONYMOUS, -1, 0);
-			igt_assert(data != MAP_FAILED);
-		} else {
-			data = aligned_alloc(xe_get_default_alignment(fd),
-					     bo_size);
-			igt_assert(data);
-		}
-	} else {
-		if (flags & PREFETCH)
-			bo = xe_bo_create(fd, 0, bo_size,
-					  all_memory_regions(fd) |
-					  vram_if_possible(fd, 0),
-					  DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
-		else
-			bo = xe_bo_create(fd, 0, bo_size,
-					  vram_if_possible(fd, eci->gt_id),
-					  DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
-		data = xe_bo_map(fd, bo, bo_size);
-	}
-	memset(data, 0, bo_size);
-
-	for (i = 0; i < n_exec_queues; i++) {
-		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
-		if (flags & BIND_EXEC_QUEUE)
-			bind_exec_queues[i] =
-				xe_bind_exec_queue_create(fd, vm, 0);
-		else
-			bind_exec_queues[i] = 0;
-	};
-
-	sync[0].addr = to_user_pointer(&data[0].vm_sync);
-	if (bo)
-		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr, bo_size, sync, 1);
-	else
-		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
-					 to_user_pointer(data), addr,
-					 bo_size, sync, 1);
-#define ONE_SEC	MS_TO_NS(1000)
-	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
-		       bind_exec_queues[0], ONE_SEC);
-	data[0].vm_sync = 0;
-
-	if (flags & PREFETCH) {
-		/* Should move to system memory */
-		xe_vm_prefetch_async(fd, vm, bind_exec_queues[0], 0, addr,
-				     bo_size, sync, 1, 0);
-		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
-			       bind_exec_queues[0], ONE_SEC);
-		data[0].vm_sync = 0;
-	}
-
-	for (i = 0; i < n_execs; i++) {
-		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
-		uint64_t batch_addr = addr + batch_offset;
-		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
-		uint64_t sdi_addr = addr + sdi_offset;
-		int e = i % n_exec_queues;
+	placement = (flags & PREFETCH) ? all_memory_regions(fd) |
+		vram_if_possible(fd, 0) : vram_if_possible(fd, eci->gt_id);
 
-		b = 0;
-		if (flags & INVALID_VA)
-			sdi_addr = 0x1fffffffffff000;
+	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
 
-		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
-		data[i].batch[b++] = sdi_addr;
-		data[i].batch[b++] = sdi_addr >> 32;
-		data[i].batch[b++] = 0xc0ffee;
-		data[i].batch[b++] = MI_BATCH_BUFFER_END;
-		igt_assert(b <= ARRAY_SIZE(data[i].batch));
+	create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
+			n_exec_queues, eci, flags);
 
-		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
+	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data, flags);
 
-		exec.exec_queue_id = exec_queues[e];
-		exec.address = batch_addr;
-		xe_exec(fd, &exec);
+	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
+			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd, flags);
 
-		if (flags & REBIND && i + 1 != n_execs) {
-			xe_wait_ufence(fd, &data[i].exec_sync, USER_FENCE_VALUE,
-				       exec_queues[e], ONE_SEC);
-			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
-					   addr, bo_size, NULL, 0);
-
-			sync[0].addr = to_user_pointer(&data[0].vm_sync);
-			addr += bo_size;
-			if (bo)
-				xe_vm_bind_async(fd, vm, bind_exec_queues[e], bo,
-						 0, addr, bo_size, sync, 1);
-			else
-				xe_vm_bind_userptr_async(fd, vm,
-							 bind_exec_queues[e],
-							 to_user_pointer(data),
-							 addr, bo_size, sync,
-							 1);
-			xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
-				       bind_exec_queues[e], ONE_SEC);
-			data[0].vm_sync = 0;
-		}
-
-		if (flags & INVALIDATE && i + 1 != n_execs) {
-			if (!(flags & RACE)) {
-				/*
-				 * Wait for exec completion and check data as
-				 * userptr will likely change to different
-				 * physical memory on next mmap call triggering
-				 * an invalidate.
-				 */
-				xe_wait_ufence(fd, &data[i].exec_sync,
-					       USER_FENCE_VALUE, exec_queues[e],
-					       ONE_SEC);
-				igt_assert_eq(data[i].data, 0xc0ffee);
-			} else if (i * 2 != n_execs) {
-				/*
-				 * We issue 1 mmap which races against running
-				 * jobs. No real check here aside from this test
-				 * not faulting on the GPU.
-				 */
-				continue;
-			}
-
-			if (flags & RACE) {
-				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
-					      0x666);
-				write(map_fd, data, bo_size);
-				data = mmap((void *)MAP_ADDRESS, bo_size,
-					    PROT_READ | PROT_WRITE, MAP_SHARED |
-					    MAP_FIXED, map_fd, 0);
-			} else {
-				data = mmap((void *)MAP_ADDRESS, bo_size,
-					    PROT_READ | PROT_WRITE, MAP_SHARED |
-					    MAP_FIXED | MAP_ANONYMOUS, -1, 0);
-			}
-			igt_assert(data != MAP_FAILED);
-		}
-	}
 	if (!(flags & INVALID_FAULT)) {
 		int64_t timeout = ONE_SEC;
 
@@ -286,22 +144,15 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 							       exec_queues[i % n_exec_queues], &timeout), 0);
 		}
 	}
-	sync[0].addr = to_user_pointer(&data[0].vm_sync);
-	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
-			   sync, 1);
-	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
-		       bind_exec_queues[0], ONE_SEC);
+
+	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
 
 	if (!(flags & INVALID_FAULT) && !(flags & INVALID_VA)) {
 		for (i = j; i < n_execs; i++)
 				igt_assert_eq(data[i].data, 0xc0ffee);
 	}
 
-	for (i = 0; i < n_exec_queues; i++) {
-		xe_exec_queue_destroy(fd, exec_queues[i]);
-		if (bind_exec_queues[i])
-			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
-	}
+	destory_exec_queue(fd, exec_queues, bind_exec_queues, n_exec_queues);
 
 	if (bo) {
 		munmap(data, bo_size);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* RE: [PATCH] lib/xe/xe_util: Creating the helper functions
  2024-03-31 18:59 [PATCH] lib/xe/xe_util: Creating the helper functions Bommu Krishnaiah
@ 2024-03-31 19:04 ` Bommu, Krishnaiah
  2024-03-31 19:05 ` ✗ CI.Patch_applied: failure for " Patchwork
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: Bommu, Krishnaiah @ 2024-03-31 19:04 UTC (permalink / raw)
  To: intel-xe@lists.freedesktop.org
  Cc: Kumar, Janga Rahul, Ghimiray, Himal Prasad, Zeng, Oak



> -----Original Message-----
> From: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>
> Sent: Monday, April 1, 2024 12:30 AM
> To: intel-xe@lists.freedesktop.org
> Cc: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Kumar, Janga Rahul
> <janga.rahul.kumar@intel.com>; Ghimiray, Himal Prasad
> <himal.prasad.ghimiray@intel.com>; Zeng, Oak <oak.zeng@intel.com>
> Subject: [PATCH] lib/xe/xe_util: Creating the helper functions
> 
> Creating the helper functions which can be reused by many tests, as part of this
> patch xe_exec_compute_mode and xe_exec_fault_mode are used this helper
> functions.
> 
> Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu@intel.com>
> Cc: Janga Rahul Kumar <janga.rahul.kumar@intel.com>
> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> Cc: Oak Zeng <oak.zeng@intel.com>
> ---
>  lib/xe/xe_util.c                   | 194 +++++++++++++++++++++++++++++
>  lib/xe/xe_util.h                   |  48 +++++++
>  tests/intel/xe_exec_compute_mode.c | 190 ++++------------------------
>  tests/intel/xe_exec_fault_mode.c   | 179 +++-----------------------
>  4 files changed, 282 insertions(+), 329 deletions(-)
> 
> diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c index 050162b5e..17cd45e45
> 100644
> --- a/lib/xe/xe_util.c
> +++ b/lib/xe/xe_util.c
> @@ -255,3 +255,197 @@ bool xe_is_gt_in_c6(int fd, int gt)
> 
>  	return strcmp(gt_c_state, "gt-c6") == 0;  }
> +
> +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> +		uint32_t bo_size, uint64_t placement, unsigned int flags) {
> +	struct cpu_va *data;
> +
> +	if (flags & USERPTR) {
> +		if (flags & INVALIDATE) {
> +			data = mmap((void *)MAP_ADDRESS, bo_size,
> PROT_READ | PROT_WRITE,
> +					MAP_SHARED | MAP_FIXED |
> MAP_ANONYMOUS, -1, 0);
> +			igt_assert(data != MAP_FAILED);
> +		} else {
> +			data = aligned_alloc(xe_get_default_alignment(fd),
> bo_size);
> +			igt_assert(data);
> +		}
> +	} else {
> +		*bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0, bo_size,
> placement,
> +
> 	DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> +		data = xe_bo_map(fd, *bo, bo_size);
> +	}
> +
> +	memset(data, 0, bo_size);
> +
> +	return data;
> +}
> +
> +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues,
> +		struct drm_xe_engine_class_instance *eci, unsigned int flags) {
> +	int i;
> +
> +	for (i = 0; i < n_exec_queues; i++) {
> +		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> +		if (flags & BIND_EXEC_QUEUE)
> +			bind_exec_queues[i] = xe_bind_exec_queue_create(fd,
> vm, 0);
> +		else
> +			bind_exec_queues[i] = 0;
> +	};
> +}
> +
> +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
> +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> +		struct cpu_va *data, unsigned int flags) {
> +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> +ONE_SEC;
> +
> +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +	if (bo)
> +		xe_vm_bind_async(fd, vm, bind_exec_queues, bo, 0, addr,
> bo_size, sync, 1);
> +	else
> +		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues,
> +				to_user_pointer(data), addr,
> +				bo_size, sync, 1);
> +
> +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> +			bind_exec_queues, fence_timeout);
> +	data[0].vm_sync = 0;
> +
> +	if (flags & PREFETCH) {
> +		/* Should move to system memory */
> +		xe_vm_prefetch_async(fd, vm, bind_exec_queues, 0, addr,
> +				bo_size, sync, 1, 0);
> +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> +				bind_exec_queues, fence_timeout);
> +		data[0].vm_sync = 0;
> +	}
> +}
> +
> +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int
> +i) {
> +	int b = 0;
> +	data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> +	data[i].batch[b++] = dst;
> +	data[i].batch[b++] = dst >> 32;
> +	data[i].batch[b++] = val;
> +	data[i].batch[b++] = MI_BATCH_BUFFER_END;
> +	igt_assert(b <= ARRAY_SIZE(data[i].batch)); }
> +
Since number of arguments are more, I will add passing arguments to one structure and pass, I will send new patch with this changes.
Regards,
Krishna.
> +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
> +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> +		int *map_fd, unsigned int flags)
> +{
> +	int i;
> +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> +ONE_SEC;
> +
> +	for (i = 0; i < n_execs; i++) {
> +		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> +		uint64_t batch_addr = addr + batch_offset;
> +		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> +		uint64_t sdi_addr = addr + sdi_offset;
> +		int e = i % n_exec_queues;
> +
> +		if (flags & INVALID_VA)
> +			sdi_addr = 0x1fffffffffff000;
> +
> +		insert_store(sdi_addr, data, 0xc0ffee, i);
> +
> +		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> +
> +		exec->exec_queue_id = exec_queues[e];
> +		exec->address = batch_addr;
> +		xe_exec(fd, exec);
> +
> +		if (flags & REBIND && i + 1 != n_execs) {
> +			xe_wait_ufence(fd, &data[i].exec_sync,
> USER_FENCE_VALUE,
> +					exec_queues[e], fence_timeout);
> +			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> +					addr, bo_size, NULL, 0);
> +
> +			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +			addr += bo_size;
> +
> +			if (bo)
> +				xe_vm_bind_async(fd, vm,
> bind_exec_queues[e], bo,
> +						0, addr, bo_size, sync, 1);
> +			else
> +				xe_vm_bind_userptr_async(fd, vm,
> +						bind_exec_queues[e],
> +						to_user_pointer(data),
> +						addr, bo_size, sync, 1);
> +
> +			xe_wait_ufence(fd, &data[0].vm_sync,
> USER_FENCE_VALUE,
> +					bind_exec_queues[e], fence_timeout);
> +			data[0].vm_sync = 0;
> +		}
> +
> +		if (flags & INVALIDATE && i + 1 != n_execs) {
> +			if (!(flags & RACE)) {
> +				/*
> +				 * Wait for exec completion and check data as
> +				 * userptr will likely change to different
> +				 * physical memory on next mmap call
> triggering
> +				 * an invalidate.
> +				 */
> +				xe_wait_ufence(fd, &data[i].exec_sync,
> +						USER_FENCE_VALUE,
> exec_queues[e],
> +						fence_timeout);
> +				igt_assert_eq(data[i].data, 0xc0ffee);
> +			} else if (i * 2 != n_execs) {
> +				/*
> +				 * We issue 1 mmap which races against
> running
> +				 * jobs. No real check here aside from this test
> +				 * not faulting on the GPU.
> +				 */
> +				continue;
> +			}
> +
> +			if (flags & RACE) {
> +				*map_fd = open("/tmp", O_TMPFILE |
> O_RDWR,
> +						0x666);
> +				write(*map_fd, data, bo_size);
> +				data = mmap((void *)MAP_ADDRESS, bo_size,
> +						PROT_READ | PROT_WRITE,
> MAP_SHARED |
> +						MAP_FIXED, *map_fd, 0);
> +			} else {
> +				data = mmap((void *)MAP_ADDRESS, bo_size,
> +						PROT_READ | PROT_WRITE,
> MAP_SHARED |
> +						MAP_FIXED |
> MAP_ANONYMOUS, -1, 0);
> +			}
> +
> +			igt_assert(data != MAP_FAILED);
> +		}
> +	}
> +}
> +
> +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> +		struct drm_xe_sync *sync, struct cpu_va *data, uint64_t addr,
> +		uint32_t bo_size)
> +{
> +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> +ONE_SEC;
> +
> +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +	xe_vm_unbind_async(fd, vm, bind_exec_queues, 0, addr, bo_size,
> +			sync, 1);
> +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> +			bind_exec_queues, fence_timeout);
> +}
> +
> +void destory_exec_queue(int fd, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues) {
> +	int i;
> +
> +	for (i = 0; i < n_exec_queues; i++) {
> +		xe_exec_queue_destroy(fd, exec_queues[i]);
> +		if (bind_exec_queues[i])
> +			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> +	}
> +}
> +
> diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h index 6480ea01a..aa1b0dcc1
> 100644
> --- a/lib/xe/xe_util.h
> +++ b/lib/xe/xe_util.h
> @@ -47,4 +47,52 @@ void xe_bind_unbind_async(int fd, uint32_t vm, uint32_t
> bind_engine,
> 
>  bool xe_is_gt_in_c6(int fd, int gt);
> 
> +
> +#define USERPTR			(0x1 << 0)
> +#define REBIND			(0x1 << 1)
> +#define INVALIDATE		(0x1 << 2)
> +#define RACE			(0x1 << 3)
> +#define BIND_EXEC_QUEUE		(0x1 << 4)
> +#define PREFETCH		(0x1 << 5)
> +#define INVALID_FAULT		(0x1 << 6)
> +#define INVALID_VA		(0x1 << 7)
> +#define ENABLE_SCRATCH		(0x1 << 8)
> +#define VM_FOR_BO		(0x1 << 9)
> +#define EXEC_QUEUE_EARLY	(0x1 << 10)
> +
> +#define MAX_N_EXEC_QUEUES	16
> +#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> +#define MAP_ADDRESS		0x00007fadeadbe000
> +
> +#define ONE_SEC		MS_TO_NS(1000)
> +#define HUNDRED_SEC	MS_TO_NS(100000)
> +
> +struct cpu_va {
> +	uint32_t batch[16];
> +	uint64_t pad;
> +	uint64_t vm_sync;
> +	uint64_t exec_sync;
> +	uint32_t data;
> +};
> +
> +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> +		uint32_t bo_size, uint64_t placement,  unsigned int flags); void
> +create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues,
> +		struct drm_xe_engine_class_instance *eci, unsigned int flags);
> void
> +xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
> +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> +		struct cpu_va *data, unsigned int flags); void xe_execbuf(int fd,
> +uint32_t vm, struct drm_xe_exec *exec, int n_execs,
> +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> +		int *map_fd, unsigned int flags);
> +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int
> +i); void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> +		struct drm_xe_sync *sync, struct cpu_va *data,
> +		uint64_t addr, uint32_t bo_size);
> +void destory_exec_queue(int fd, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues);
> +
>  #endif /* XE_UTIL_H */
> diff --git a/tests/intel/xe_exec_compute_mode.c
> b/tests/intel/xe_exec_compute_mode.c
> index 3ec848164..e8d82cc69 100644
> --- a/tests/intel/xe_exec_compute_mode.c
> +++ b/tests/intel/xe_exec_compute_mode.c
> @@ -15,6 +15,7 @@
>  #include "igt.h"
>  #include "lib/igt_syncobj.h"
>  #include "lib/intel_reg.h"
> +#include "lib/xe/xe_util.h"
>  #include <sys/ioctl.h>
>  #include "xe_drm.h"
> 
> @@ -23,15 +24,6 @@
>  #include "xe/xe_spin.h"
>  #include <string.h>
> 
> -#define MAX_N_EXECQUEUES 	16
> -#define USERPTR				(0x1 << 0)
> -#define REBIND				(0x1 << 1)
> -#define INVALIDATE			(0x1 << 2)
> -#define RACE				(0x1 << 3)
> -#define BIND_EXECQUEUE		(0x1 << 4)
> -#define VM_FOR_BO			(0x1 << 5)
> -#define EXEC_QUEUE_EARLY	(0x1 << 6)
> -
>  /**
>   * SUBTEST: twice-%s
>   * Description: Run %arg[1] compute machine test twice @@ -88,7 +80,6 @@
> test_exec(int fd, struct drm_xe_engine_class_instance *eci,  {
>  	uint32_t vm;
>  	uint64_t addr = 0x1a0000;
> -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
>  	struct drm_xe_sync sync[1] = {
>  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
>  		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> @@ -99,161 +90,38 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> *eci,
>  		.num_syncs = 1,
>  		.syncs = to_user_pointer(sync),
>  	};
> -	uint32_t exec_queues[MAX_N_EXECQUEUES];
> -	uint32_t bind_exec_queues[MAX_N_EXECQUEUES];
> +	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> +	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
>  	size_t bo_size;
>  	uint32_t bo = 0;
> -	struct {
> -		uint32_t batch[16];
> -		uint64_t pad;
> -		uint64_t vm_sync;
> -		uint64_t exec_sync;
> -		uint32_t data;
> -	} *data;
> -	int i, j, b;
> +	struct cpu_va *data;
> +	int i, j;
>  	int map_fd = -1;
>  	int64_t fence_timeout;
> +	uint64_t placement;
> 
> -	igt_assert(n_exec_queues <= MAX_N_EXECQUEUES);
> +	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> 
>  	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
>  	bo_size = sizeof(*data) * n_execs;
>  	bo_size = xe_bb_size(fd, bo_size);
> 
> -	for (i = 0; (flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> -		if (flags & BIND_EXECQUEUE)
> -			bind_exec_queues[i] =
> -				xe_bind_exec_queue_create(fd, vm, 0);
> -		else
> -			bind_exec_queues[i] = 0;
> -	};
> +	if (flags & EXEC_QUEUE_EARLY)
> +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> +n_exec_queues, eci, flags);
> 
> -	if (flags & USERPTR) {
> -#define	MAP_ADDRESS	0x00007fadeadbe000
> -		if (flags & INVALIDATE) {
> -			data = mmap((void *)MAP_ADDRESS, bo_size,
> PROT_READ |
> -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> -				    MAP_ANONYMOUS, -1, 0);
> -			igt_assert(data != MAP_FAILED);
> -		} else {
> -			data = aligned_alloc(xe_get_default_alignment(fd),
> -					     bo_size);
> -			igt_assert(data);
> -		}
> -	} else {
> -		bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0,
> -				  bo_size, vram_if_possible(fd, eci->gt_id),
> -
> DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -		data = xe_bo_map(fd, bo, bo_size);
> -	}
> -	memset(data, 0, bo_size);
> +	placement = vram_if_possible(fd, eci->gt_id);
> 
> -	for (i = 0; !(flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> -		if (flags & BIND_EXECQUEUE)
> -			bind_exec_queues[i] =
> -				xe_bind_exec_queue_create(fd, vm, 0);
> -		else
> -			bind_exec_queues[i] = 0;
> -	};
> +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> 
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	if (bo)
> -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> -				 bo_size, sync, 1);
> -	else
> -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> -					 to_user_pointer(data), addr,
> -					 bo_size, sync, 1);
> -#define ONE_SEC	MS_TO_NS(1000)
> -#define HUNDRED_SEC	MS_TO_NS(100000)
> +	if(!(flags & EXEC_QUEUE_EARLY))
> +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> +n_exec_queues, eci, flags);
> +
> +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> +flags);
> 
>  	fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
> 
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], fence_timeout);
> -	data[0].vm_sync = 0;
> -
> -	for (i = 0; i < n_execs; i++) {
> -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> -		uint64_t batch_addr = addr + batch_offset;
> -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> -		uint64_t sdi_addr = addr + sdi_offset;
> -		int e = i % n_exec_queues;
> -
> -		b = 0;
> -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> -		data[i].batch[b++] = sdi_addr;
> -		data[i].batch[b++] = sdi_addr >> 32;
> -		data[i].batch[b++] = 0xc0ffee;
> -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> -
> -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> -
> -		exec.exec_queue_id = exec_queues[e];
> -		exec.address = batch_addr;
> -		xe_exec(fd, &exec);
> -
> -		if (flags & REBIND && i + 1 != n_execs) {
> -			xe_wait_ufence(fd, &data[i].exec_sync,
> USER_FENCE_VALUE,
> -				       exec_queues[e], fence_timeout);
> -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> -					   addr, bo_size, NULL, 0);
> -
> -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -			addr += bo_size;
> -			if (bo)
> -				xe_vm_bind_async(fd, vm,
> bind_exec_queues[e], bo,
> -						 0, addr, bo_size, sync, 1);
> -			else
> -				xe_vm_bind_userptr_async(fd, vm,
> -							 bind_exec_queues[e],
> -
> to_user_pointer(data),
> -							 addr, bo_size, sync,
> -							 1);
> -			xe_wait_ufence(fd, &data[0].vm_sync,
> USER_FENCE_VALUE,
> -				       bind_exec_queues[e], fence_timeout);
> -			data[0].vm_sync = 0;
> -		}
> -
> -		if (flags & INVALIDATE && i + 1 != n_execs) {
> -			if (!(flags & RACE)) {
> -				/*
> -				 * Wait for exec completion and check data as
> -				 * userptr will likely change to different
> -				 * physical memory on next mmap call
> triggering
> -				 * an invalidate.
> -				 */
> -				xe_wait_ufence(fd, &data[i].exec_sync,
> -					       USER_FENCE_VALUE,
> exec_queues[e],
> -					       fence_timeout);
> -				igt_assert_eq(data[i].data, 0xc0ffee);
> -			} else if (i * 2 != n_execs) {
> -				/*
> -				 * We issue 1 mmap which races against
> running
> -				 * jobs. No real check here aside from this test
> -				 * not faulting on the GPU.
> -				 */
> -				continue;
> -			}
> -
> -			if (flags & RACE) {
> -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> -					      0x666);
> -				write(map_fd, data, bo_size);
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> -					    MAP_FIXED, map_fd, 0);
> -			} else {
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> -					    MAP_FIXED | MAP_ANONYMOUS, -
> 1, 0);
> -			}
> -			igt_assert(data != MAP_FAILED);
> -		}
> -	}
> +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> +			n_exec_queues, bo, bo_size, sync, addr, data,
> &map_fd, flags);
> 
>  	j = flags & INVALIDATE ? n_execs - 1 : 0;
>  	for (i = j; i < n_execs; i++)
> @@ -264,20 +132,12 @@ test_exec(int fd, struct
> drm_xe_engine_class_instance *eci,
>  	if (flags & INVALIDATE)
>  		usleep(250000);
> 
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> -			   sync, 1);
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], fence_timeout);
> +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> 
>  	for (i = j; i < n_execs; i++)
>  		igt_assert_eq(data[i].data, 0xc0ffee);
> 
> -	for (i = 0; i < n_exec_queues; i++) {
> -		xe_exec_queue_destroy(fd, exec_queues[i]);
> -		if (bind_exec_queues[i])
> -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> -	}
> +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> n_exec_queues);
> 
>  	if (bo) {
>  		munmap(data, bo_size);
> @@ -492,14 +352,14 @@ igt_main
>  		{ "userptr-rebind", USERPTR | REBIND },
>  		{ "userptr-invalidate", USERPTR | INVALIDATE },
>  		{ "userptr-invalidate-race", USERPTR | INVALIDATE | RACE },
> -		{ "bindexecqueue", BIND_EXECQUEUE },
> -		{ "bindexecqueue-userptr", BIND_EXECQUEUE | USERPTR },
> -		{ "bindexecqueue-rebind",  BIND_EXECQUEUE | REBIND },
> -		{ "bindexecqueue-userptr-rebind",  BIND_EXECQUEUE |
> USERPTR |
> +		{ "bindexecqueue", BIND_EXEC_QUEUE },
> +		{ "bindexecqueue-userptr", BIND_EXEC_QUEUE | USERPTR },
> +		{ "bindexecqueue-rebind",  BIND_EXEC_QUEUE | REBIND },
> +		{ "bindexecqueue-userptr-rebind",  BIND_EXEC_QUEUE |
> USERPTR |
>  			REBIND },
> -		{ "bindexecqueue-userptr-invalidate",  BIND_EXECQUEUE |
> USERPTR |
> +		{ "bindexecqueue-userptr-invalidate",  BIND_EXEC_QUEUE |
> USERPTR |
>  			INVALIDATE },
> -		{ "bindexecqueue-userptr-invalidate-race", BIND_EXECQUEUE |
> USERPTR |
> +		{ "bindexecqueue-userptr-invalidate-race", BIND_EXEC_QUEUE
> | USERPTR
> +|
>  			INVALIDATE | RACE },
>  		{ NULL },
>  	};
> diff --git a/tests/intel/xe_exec_fault_mode.c
> b/tests/intel/xe_exec_fault_mode.c
> index 40fe1743e..7b2fef224 100644
> --- a/tests/intel/xe_exec_fault_mode.c
> +++ b/tests/intel/xe_exec_fault_mode.c
> @@ -16,24 +16,13 @@
>  #include "igt.h"
>  #include "lib/igt_syncobj.h"
>  #include "lib/intel_reg.h"
> +#include "lib/xe/xe_util.h"
>  #include "xe_drm.h"
> 
>  #include "xe/xe_ioctl.h"
>  #include "xe/xe_query.h"
>  #include <string.h>
> 
> -#define MAX_N_EXEC_QUEUES	16
> -
> -#define USERPTR		(0x1 << 0)
> -#define REBIND		(0x1 << 1)
> -#define INVALIDATE	(0x1 << 2)
> -#define RACE		(0x1 << 3)
> -#define BIND_EXEC_QUEUE	(0x1 << 4)
> -#define PREFETCH	(0x1 << 5)
> -#define INVALID_FAULT	(0x1 << 6)
> -#define INVALID_VA	(0x1 << 7)
> -#define ENABLE_SCRATCH  (0x1 << 8)
> -
>  /**
>   * SUBTEST: invalid-va
>   * Description: Access invalid va and check for EIO through user fence.
> @@ -99,7 +88,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> *eci,  {
>  	uint32_t vm;
>  	uint64_t addr = 0x1a0000;
> -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
>  	struct drm_xe_sync sync[1] = {
>  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags =
> DRM_XE_SYNC_FLAG_SIGNAL,
>  	          .timeline_value = USER_FENCE_VALUE }, @@ -113,14 +101,9 @@
> test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
>  	size_t bo_size;
>  	uint32_t bo = 0;
> -	struct {
> -		uint32_t batch[16];
> -		uint64_t pad;
> -		uint64_t vm_sync;
> -		uint64_t exec_sync;
> -		uint32_t data;
> -	} *data;
> -	int i, j, b;
> +	struct cpu_va *data;
> +	uint64_t placement;
> +	int i, j;
>  	int map_fd = -1;
> 
>  	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES); @@ -134,144
> +117,19 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  	bo_size = sizeof(*data) * n_execs;
>  	bo_size = xe_bb_size(fd, bo_size);
> 
> -	if (flags & USERPTR) {
> -#define	MAP_ADDRESS	0x00007fadeadbe000
> -		if (flags & INVALIDATE) {
> -			data = mmap((void *)MAP_ADDRESS, bo_size,
> PROT_READ |
> -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> -				    MAP_ANONYMOUS, -1, 0);
> -			igt_assert(data != MAP_FAILED);
> -		} else {
> -			data = aligned_alloc(xe_get_default_alignment(fd),
> -					     bo_size);
> -			igt_assert(data);
> -		}
> -	} else {
> -		if (flags & PREFETCH)
> -			bo = xe_bo_create(fd, 0, bo_size,
> -					  all_memory_regions(fd) |
> -					  vram_if_possible(fd, 0),
> -
> DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -		else
> -			bo = xe_bo_create(fd, 0, bo_size,
> -					  vram_if_possible(fd, eci->gt_id),
> -
> DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -		data = xe_bo_map(fd, bo, bo_size);
> -	}
> -	memset(data, 0, bo_size);
> -
> -	for (i = 0; i < n_exec_queues; i++) {
> -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> -		if (flags & BIND_EXEC_QUEUE)
> -			bind_exec_queues[i] =
> -				xe_bind_exec_queue_create(fd, vm, 0);
> -		else
> -			bind_exec_queues[i] = 0;
> -	};
> -
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	if (bo)
> -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> bo_size, sync, 1);
> -	else
> -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> -					 to_user_pointer(data), addr,
> -					 bo_size, sync, 1);
> -#define ONE_SEC	MS_TO_NS(1000)
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], ONE_SEC);
> -	data[0].vm_sync = 0;
> -
> -	if (flags & PREFETCH) {
> -		/* Should move to system memory */
> -		xe_vm_prefetch_async(fd, vm, bind_exec_queues[0], 0, addr,
> -				     bo_size, sync, 1, 0);
> -		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -			       bind_exec_queues[0], ONE_SEC);
> -		data[0].vm_sync = 0;
> -	}
> -
> -	for (i = 0; i < n_execs; i++) {
> -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> -		uint64_t batch_addr = addr + batch_offset;
> -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> -		uint64_t sdi_addr = addr + sdi_offset;
> -		int e = i % n_exec_queues;
> +	placement = (flags & PREFETCH) ? all_memory_regions(fd) |
> +		vram_if_possible(fd, 0) : vram_if_possible(fd, eci->gt_id);
> 
> -		b = 0;
> -		if (flags & INVALID_VA)
> -			sdi_addr = 0x1fffffffffff000;
> +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> 
> -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> -		data[i].batch[b++] = sdi_addr;
> -		data[i].batch[b++] = sdi_addr >> 32;
> -		data[i].batch[b++] = 0xc0ffee;
> -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> +	create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> +			n_exec_queues, eci, flags);
> 
> -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> +flags);
> 
> -		exec.exec_queue_id = exec_queues[e];
> -		exec.address = batch_addr;
> -		xe_exec(fd, &exec);
> +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> +			n_exec_queues, bo, bo_size, sync, addr, data,
> &map_fd, flags);
> 
> -		if (flags & REBIND && i + 1 != n_execs) {
> -			xe_wait_ufence(fd, &data[i].exec_sync,
> USER_FENCE_VALUE,
> -				       exec_queues[e], ONE_SEC);
> -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> -					   addr, bo_size, NULL, 0);
> -
> -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -			addr += bo_size;
> -			if (bo)
> -				xe_vm_bind_async(fd, vm,
> bind_exec_queues[e], bo,
> -						 0, addr, bo_size, sync, 1);
> -			else
> -				xe_vm_bind_userptr_async(fd, vm,
> -							 bind_exec_queues[e],
> -
> to_user_pointer(data),
> -							 addr, bo_size, sync,
> -							 1);
> -			xe_wait_ufence(fd, &data[0].vm_sync,
> USER_FENCE_VALUE,
> -				       bind_exec_queues[e], ONE_SEC);
> -			data[0].vm_sync = 0;
> -		}
> -
> -		if (flags & INVALIDATE && i + 1 != n_execs) {
> -			if (!(flags & RACE)) {
> -				/*
> -				 * Wait for exec completion and check data as
> -				 * userptr will likely change to different
> -				 * physical memory on next mmap call
> triggering
> -				 * an invalidate.
> -				 */
> -				xe_wait_ufence(fd, &data[i].exec_sync,
> -					       USER_FENCE_VALUE,
> exec_queues[e],
> -					       ONE_SEC);
> -				igt_assert_eq(data[i].data, 0xc0ffee);
> -			} else if (i * 2 != n_execs) {
> -				/*
> -				 * We issue 1 mmap which races against
> running
> -				 * jobs. No real check here aside from this test
> -				 * not faulting on the GPU.
> -				 */
> -				continue;
> -			}
> -
> -			if (flags & RACE) {
> -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> -					      0x666);
> -				write(map_fd, data, bo_size);
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> -					    MAP_FIXED, map_fd, 0);
> -			} else {
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> -					    MAP_FIXED | MAP_ANONYMOUS, -
> 1, 0);
> -			}
> -			igt_assert(data != MAP_FAILED);
> -		}
> -	}
>  	if (!(flags & INVALID_FAULT)) {
>  		int64_t timeout = ONE_SEC;
> 
> @@ -286,22 +144,15 @@ test_exec(int fd, struct
> drm_xe_engine_class_instance *eci,
>  							       exec_queues[i %
> n_exec_queues], &timeout), 0);
>  		}
>  	}
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> -			   sync, 1);
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], ONE_SEC);
> +
> +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> 
>  	if (!(flags & INVALID_FAULT) && !(flags & INVALID_VA)) {
>  		for (i = j; i < n_execs; i++)
>  				igt_assert_eq(data[i].data, 0xc0ffee);
>  	}
> 
> -	for (i = 0; i < n_exec_queues; i++) {
> -		xe_exec_queue_destroy(fd, exec_queues[i]);
> -		if (bind_exec_queues[i])
> -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> -	}
> +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> n_exec_queues);
> 
>  	if (bo) {
>  		munmap(data, bo_size);
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* ✗ CI.Patch_applied: failure for lib/xe/xe_util: Creating the helper functions
  2024-03-31 18:59 [PATCH] lib/xe/xe_util: Creating the helper functions Bommu Krishnaiah
  2024-03-31 19:04 ` Bommu, Krishnaiah
@ 2024-03-31 19:05 ` Patchwork
  2024-03-31 21:05 ` [PATCH] " Matthew Brost
  2024-04-01 17:39 ` Zeng, Oak
  3 siblings, 0 replies; 9+ messages in thread
From: Patchwork @ 2024-03-31 19:05 UTC (permalink / raw)
  To: Bommu, Krishnaiah; +Cc: intel-xe

== Series Details ==

Series: lib/xe/xe_util: Creating the helper functions
URL   : https://patchwork.freedesktop.org/series/131869/
State : failure

== Summary ==

=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: b334e21cd003 drm-tip: 2024y-03m-29d-19h-58m-28s UTC integration manifest
=== git am output follows ===
error: lib/xe/xe_util.c: does not exist in index
error: lib/xe/xe_util.h: does not exist in index
error: tests/intel/xe_exec_compute_mode.c: does not exist in index
error: tests/intel/xe_exec_fault_mode.c: does not exist in index
hint: Use 'git am --show-current-patch=diff' to see the failed patch
Applying: lib/xe/xe_util: Creating the helper functions
Patch failed at 0001 lib/xe/xe_util: Creating the helper functions
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] lib/xe/xe_util: Creating the helper functions
  2024-03-31 18:59 [PATCH] lib/xe/xe_util: Creating the helper functions Bommu Krishnaiah
  2024-03-31 19:04 ` Bommu, Krishnaiah
  2024-03-31 19:05 ` ✗ CI.Patch_applied: failure for " Patchwork
@ 2024-03-31 21:05 ` Matthew Brost
  2024-04-01 16:10   ` Zeng, Oak
  2024-04-01 17:39 ` Zeng, Oak
  3 siblings, 1 reply; 9+ messages in thread
From: Matthew Brost @ 2024-03-31 21:05 UTC (permalink / raw)
  To: Bommu Krishnaiah
  Cc: intel-xe, Janga Rahul Kumar, Himal Prasad Ghimiray, Oak Zeng

On Mon, Apr 01, 2024 at 12:29:49AM +0530, Bommu Krishnaiah wrote:
> Creating the helper functions which can be reused by many tests,
> as part of this patch xe_exec_compute_mode and xe_exec_fault_mode
> are used this helper functions.
>

As the author of most these tests and IGTs, I can say while not great to
have copy paste between tests I much prefer it to adding helpers to try
to jam the needs of each test into the helpers. IMO this just creates a
headache in the long run. e.g. Let's say I want to change the behavior
of xe_exec_compute_mode but not xe_exec_fault_mode, with shared helpers
this gets harder.

I'd much rather leave this as is.

Maybe I can buy a helper or two to program the batches, i.e.
insert_store LGTM, but certainly not the logic doing bind, invalidates,
loop, etc...

Matt
 
> Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu@intel.com>
> Cc: Janga Rahul Kumar <janga.rahul.kumar@intel.com>
> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> Cc: Oak Zeng <oak.zeng@intel.com>
> ---
>  lib/xe/xe_util.c                   | 194 +++++++++++++++++++++++++++++
>  lib/xe/xe_util.h                   |  48 +++++++
>  tests/intel/xe_exec_compute_mode.c | 190 ++++------------------------
>  tests/intel/xe_exec_fault_mode.c   | 179 +++-----------------------
>  4 files changed, 282 insertions(+), 329 deletions(-)
> 
> diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c
> index 050162b5e..17cd45e45 100644
> --- a/lib/xe/xe_util.c
> +++ b/lib/xe/xe_util.c
> @@ -255,3 +255,197 @@ bool xe_is_gt_in_c6(int fd, int gt)
>  
>  	return strcmp(gt_c_state, "gt-c6") == 0;
>  }
> +
> +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> +		uint32_t bo_size, uint64_t placement, unsigned int flags)
> +{
> +	struct cpu_va *data;
> +
> +	if (flags & USERPTR) {
> +		if (flags & INVALIDATE) {
> +			data = mmap((void *)MAP_ADDRESS, bo_size, PROT_READ | PROT_WRITE,
> +					MAP_SHARED | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
> +			igt_assert(data != MAP_FAILED);
> +		} else {
> +			data = aligned_alloc(xe_get_default_alignment(fd), bo_size);
> +			igt_assert(data);
> +		}
> +	} else {
> +		*bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0, bo_size, placement,
> +				DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> +		data = xe_bo_map(fd, *bo, bo_size);
> +	}
> +
> +	memset(data, 0, bo_size);
> +
> +	return data;
> +}
> +
> +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues,
> +		struct drm_xe_engine_class_instance *eci, unsigned int flags)
> +{
> +	int i;
> +
> +	for (i = 0; i < n_exec_queues; i++) {
> +		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> +		if (flags & BIND_EXEC_QUEUE)
> +			bind_exec_queues[i] = xe_bind_exec_queue_create(fd, vm, 0);
> +		else
> +			bind_exec_queues[i] = 0;
> +	};
> +}
> +
> +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
> +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> +		struct cpu_va *data, unsigned int flags)
> +{
> +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
> +
> +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +	if (bo)
> +		xe_vm_bind_async(fd, vm, bind_exec_queues, bo, 0, addr, bo_size, sync, 1);
> +	else
> +		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues,
> +				to_user_pointer(data), addr,
> +				bo_size, sync, 1);
> +
> +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> +			bind_exec_queues, fence_timeout);
> +	data[0].vm_sync = 0;
> +
> +	if (flags & PREFETCH) {
> +		/* Should move to system memory */
> +		xe_vm_prefetch_async(fd, vm, bind_exec_queues, 0, addr,
> +				bo_size, sync, 1, 0);
> +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> +				bind_exec_queues, fence_timeout);
> +		data[0].vm_sync = 0;
> +	}
> +}
> +
> +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i)
> +{
> +	int b = 0;
> +	data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> +	data[i].batch[b++] = dst;
> +	data[i].batch[b++] = dst >> 32;
> +	data[i].batch[b++] = val;
> +	data[i].batch[b++] = MI_BATCH_BUFFER_END;
> +	igt_assert(b <= ARRAY_SIZE(data[i].batch));
> +}
> +
> +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
> +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> +		int *map_fd, unsigned int flags)
> +{
> +	int i;
> +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
> +
> +	for (i = 0; i < n_execs; i++) {
> +		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> +		uint64_t batch_addr = addr + batch_offset;
> +		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> +		uint64_t sdi_addr = addr + sdi_offset;
> +		int e = i % n_exec_queues;
> +
> +		if (flags & INVALID_VA)
> +			sdi_addr = 0x1fffffffffff000;
> +
> +		insert_store(sdi_addr, data, 0xc0ffee, i);
> +
> +		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> +
> +		exec->exec_queue_id = exec_queues[e];
> +		exec->address = batch_addr;
> +		xe_exec(fd, exec);
> +
> +		if (flags & REBIND && i + 1 != n_execs) {
> +			xe_wait_ufence(fd, &data[i].exec_sync, USER_FENCE_VALUE,
> +					exec_queues[e], fence_timeout);
> +			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> +					addr, bo_size, NULL, 0);
> +
> +			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +			addr += bo_size;
> +
> +			if (bo)
> +				xe_vm_bind_async(fd, vm, bind_exec_queues[e], bo,
> +						0, addr, bo_size, sync, 1);
> +			else
> +				xe_vm_bind_userptr_async(fd, vm,
> +						bind_exec_queues[e],
> +						to_user_pointer(data),
> +						addr, bo_size, sync, 1);
> +
> +			xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> +					bind_exec_queues[e], fence_timeout);
> +			data[0].vm_sync = 0;
> +		}
> +
> +		if (flags & INVALIDATE && i + 1 != n_execs) {
> +			if (!(flags & RACE)) {
> +				/*
> +				 * Wait for exec completion and check data as
> +				 * userptr will likely change to different
> +				 * physical memory on next mmap call triggering
> +				 * an invalidate.
> +				 */
> +				xe_wait_ufence(fd, &data[i].exec_sync,
> +						USER_FENCE_VALUE, exec_queues[e],
> +						fence_timeout);
> +				igt_assert_eq(data[i].data, 0xc0ffee);
> +			} else if (i * 2 != n_execs) {
> +				/*
> +				 * We issue 1 mmap which races against running
> +				 * jobs. No real check here aside from this test
> +				 * not faulting on the GPU.
> +				 */
> +				continue;
> +			}
> +
> +			if (flags & RACE) {
> +				*map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> +						0x666);
> +				write(*map_fd, data, bo_size);
> +				data = mmap((void *)MAP_ADDRESS, bo_size,
> +						PROT_READ | PROT_WRITE, MAP_SHARED |
> +						MAP_FIXED, *map_fd, 0);
> +			} else {
> +				data = mmap((void *)MAP_ADDRESS, bo_size,
> +						PROT_READ | PROT_WRITE, MAP_SHARED |
> +						MAP_FIXED | MAP_ANONYMOUS, -1, 0);
> +			}
> +
> +			igt_assert(data != MAP_FAILED);
> +		}
> +	}
> +}
> +
> +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> +		struct drm_xe_sync *sync, struct cpu_va *data, uint64_t addr,
> +		uint32_t bo_size)
> +{
> +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
> +
> +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +	xe_vm_unbind_async(fd, vm, bind_exec_queues, 0, addr, bo_size,
> +			sync, 1);
> +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> +			bind_exec_queues, fence_timeout);
> +}
> +
> +void destory_exec_queue(int fd, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues)
> +{
> +	int i;
> +
> +	for (i = 0; i < n_exec_queues; i++) {
> +		xe_exec_queue_destroy(fd, exec_queues[i]);
> +		if (bind_exec_queues[i])
> +			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> +	}
> +}
> +
> diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h
> index 6480ea01a..aa1b0dcc1 100644
> --- a/lib/xe/xe_util.h
> +++ b/lib/xe/xe_util.h
> @@ -47,4 +47,52 @@ void xe_bind_unbind_async(int fd, uint32_t vm, uint32_t bind_engine,
>  
>  bool xe_is_gt_in_c6(int fd, int gt);
>  
> +
> +#define USERPTR			(0x1 << 0)
> +#define REBIND			(0x1 << 1)
> +#define INVALIDATE		(0x1 << 2)
> +#define RACE			(0x1 << 3)
> +#define BIND_EXEC_QUEUE		(0x1 << 4)
> +#define PREFETCH		(0x1 << 5)
> +#define INVALID_FAULT		(0x1 << 6)
> +#define INVALID_VA		(0x1 << 7)
> +#define ENABLE_SCRATCH		(0x1 << 8)
> +#define VM_FOR_BO		(0x1 << 9)
> +#define EXEC_QUEUE_EARLY	(0x1 << 10)
> +
> +#define MAX_N_EXEC_QUEUES	16
> +#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> +#define MAP_ADDRESS		0x00007fadeadbe000
> +
> +#define ONE_SEC		MS_TO_NS(1000)
> +#define HUNDRED_SEC	MS_TO_NS(100000)
> +
> +struct cpu_va {
> +	uint32_t batch[16];
> +	uint64_t pad;
> +	uint64_t vm_sync;
> +	uint64_t exec_sync;
> +	uint32_t data;
> +};
> +
> +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> +		uint32_t bo_size, uint64_t placement,  unsigned int flags);
> +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues,
> +		struct drm_xe_engine_class_instance *eci, unsigned int flags);
> +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
> +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> +		struct cpu_va *data, unsigned int flags);
> +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
> +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> +		int *map_fd, unsigned int flags);
> +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i);
> +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> +		struct drm_xe_sync *sync, struct cpu_va *data,
> +		uint64_t addr, uint32_t bo_size);
> +void destory_exec_queue(int fd, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues);
> +
>  #endif /* XE_UTIL_H */
> diff --git a/tests/intel/xe_exec_compute_mode.c b/tests/intel/xe_exec_compute_mode.c
> index 3ec848164..e8d82cc69 100644
> --- a/tests/intel/xe_exec_compute_mode.c
> +++ b/tests/intel/xe_exec_compute_mode.c
> @@ -15,6 +15,7 @@
>  #include "igt.h"
>  #include "lib/igt_syncobj.h"
>  #include "lib/intel_reg.h"
> +#include "lib/xe/xe_util.h"
>  #include <sys/ioctl.h>
>  #include "xe_drm.h"
>  
> @@ -23,15 +24,6 @@
>  #include "xe/xe_spin.h"
>  #include <string.h>
>  
> -#define MAX_N_EXECQUEUES 	16
> -#define USERPTR				(0x1 << 0)
> -#define REBIND				(0x1 << 1)
> -#define INVALIDATE			(0x1 << 2)
> -#define RACE				(0x1 << 3)
> -#define BIND_EXECQUEUE		(0x1 << 4)
> -#define VM_FOR_BO			(0x1 << 5)
> -#define EXEC_QUEUE_EARLY	(0x1 << 6)
> -
>  /**
>   * SUBTEST: twice-%s
>   * Description: Run %arg[1] compute machine test twice
> @@ -88,7 +80,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  {
>  	uint32_t vm;
>  	uint64_t addr = 0x1a0000;
> -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
>  	struct drm_xe_sync sync[1] = {
>  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
>  		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> @@ -99,161 +90,38 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  		.num_syncs = 1,
>  		.syncs = to_user_pointer(sync),
>  	};
> -	uint32_t exec_queues[MAX_N_EXECQUEUES];
> -	uint32_t bind_exec_queues[MAX_N_EXECQUEUES];
> +	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> +	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
>  	size_t bo_size;
>  	uint32_t bo = 0;
> -	struct {
> -		uint32_t batch[16];
> -		uint64_t pad;
> -		uint64_t vm_sync;
> -		uint64_t exec_sync;
> -		uint32_t data;
> -	} *data;
> -	int i, j, b;
> +	struct cpu_va *data;
> +	int i, j;
>  	int map_fd = -1;
>  	int64_t fence_timeout;
> +	uint64_t placement;
>  
> -	igt_assert(n_exec_queues <= MAX_N_EXECQUEUES);
> +	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
>  
>  	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
>  	bo_size = sizeof(*data) * n_execs;
>  	bo_size = xe_bb_size(fd, bo_size);
>  
> -	for (i = 0; (flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> -		if (flags & BIND_EXECQUEUE)
> -			bind_exec_queues[i] =
> -				xe_bind_exec_queue_create(fd, vm, 0);
> -		else
> -			bind_exec_queues[i] = 0;
> -	};
> +	if (flags & EXEC_QUEUE_EARLY)
> +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues, n_exec_queues, eci, flags);
>  
> -	if (flags & USERPTR) {
> -#define	MAP_ADDRESS	0x00007fadeadbe000
> -		if (flags & INVALIDATE) {
> -			data = mmap((void *)MAP_ADDRESS, bo_size, PROT_READ |
> -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> -				    MAP_ANONYMOUS, -1, 0);
> -			igt_assert(data != MAP_FAILED);
> -		} else {
> -			data = aligned_alloc(xe_get_default_alignment(fd),
> -					     bo_size);
> -			igt_assert(data);
> -		}
> -	} else {
> -		bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0,
> -				  bo_size, vram_if_possible(fd, eci->gt_id),
> -				  DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -		data = xe_bo_map(fd, bo, bo_size);
> -	}
> -	memset(data, 0, bo_size);
> +	placement = vram_if_possible(fd, eci->gt_id);
>  
> -	for (i = 0; !(flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> -		if (flags & BIND_EXECQUEUE)
> -			bind_exec_queues[i] =
> -				xe_bind_exec_queue_create(fd, vm, 0);
> -		else
> -			bind_exec_queues[i] = 0;
> -	};
> +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
>  
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	if (bo)
> -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> -				 bo_size, sync, 1);
> -	else
> -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> -					 to_user_pointer(data), addr,
> -					 bo_size, sync, 1);
> -#define ONE_SEC	MS_TO_NS(1000)
> -#define HUNDRED_SEC	MS_TO_NS(100000)
> +	if(!(flags & EXEC_QUEUE_EARLY))
> +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues, n_exec_queues, eci, flags);
> +
> +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data, flags);
>  
>  	fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
>  
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], fence_timeout);
> -	data[0].vm_sync = 0;
> -
> -	for (i = 0; i < n_execs; i++) {
> -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> -		uint64_t batch_addr = addr + batch_offset;
> -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> -		uint64_t sdi_addr = addr + sdi_offset;
> -		int e = i % n_exec_queues;
> -
> -		b = 0;
> -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> -		data[i].batch[b++] = sdi_addr;
> -		data[i].batch[b++] = sdi_addr >> 32;
> -		data[i].batch[b++] = 0xc0ffee;
> -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> -
> -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> -
> -		exec.exec_queue_id = exec_queues[e];
> -		exec.address = batch_addr;
> -		xe_exec(fd, &exec);
> -
> -		if (flags & REBIND && i + 1 != n_execs) {
> -			xe_wait_ufence(fd, &data[i].exec_sync, USER_FENCE_VALUE,
> -				       exec_queues[e], fence_timeout);
> -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> -					   addr, bo_size, NULL, 0);
> -
> -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -			addr += bo_size;
> -			if (bo)
> -				xe_vm_bind_async(fd, vm, bind_exec_queues[e], bo,
> -						 0, addr, bo_size, sync, 1);
> -			else
> -				xe_vm_bind_userptr_async(fd, vm,
> -							 bind_exec_queues[e],
> -							 to_user_pointer(data),
> -							 addr, bo_size, sync,
> -							 1);
> -			xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -				       bind_exec_queues[e], fence_timeout);
> -			data[0].vm_sync = 0;
> -		}
> -
> -		if (flags & INVALIDATE && i + 1 != n_execs) {
> -			if (!(flags & RACE)) {
> -				/*
> -				 * Wait for exec completion and check data as
> -				 * userptr will likely change to different
> -				 * physical memory on next mmap call triggering
> -				 * an invalidate.
> -				 */
> -				xe_wait_ufence(fd, &data[i].exec_sync,
> -					       USER_FENCE_VALUE, exec_queues[e],
> -					       fence_timeout);
> -				igt_assert_eq(data[i].data, 0xc0ffee);
> -			} else if (i * 2 != n_execs) {
> -				/*
> -				 * We issue 1 mmap which races against running
> -				 * jobs. No real check here aside from this test
> -				 * not faulting on the GPU.
> -				 */
> -				continue;
> -			}
> -
> -			if (flags & RACE) {
> -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> -					      0x666);
> -				write(map_fd, data, bo_size);
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE, MAP_SHARED |
> -					    MAP_FIXED, map_fd, 0);
> -			} else {
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE, MAP_SHARED |
> -					    MAP_FIXED | MAP_ANONYMOUS, -1, 0);
> -			}
> -			igt_assert(data != MAP_FAILED);
> -		}
> -	}
> +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd, flags);
>  
>  	j = flags & INVALIDATE ? n_execs - 1 : 0;
>  	for (i = j; i < n_execs; i++)
> @@ -264,20 +132,12 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  	if (flags & INVALIDATE)
>  		usleep(250000);
>  
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> -			   sync, 1);
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], fence_timeout);
> +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
>  
>  	for (i = j; i < n_execs; i++)
>  		igt_assert_eq(data[i].data, 0xc0ffee);
>  
> -	for (i = 0; i < n_exec_queues; i++) {
> -		xe_exec_queue_destroy(fd, exec_queues[i]);
> -		if (bind_exec_queues[i])
> -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> -	}
> +	destory_exec_queue(fd, exec_queues, bind_exec_queues, n_exec_queues);
>  
>  	if (bo) {
>  		munmap(data, bo_size);
> @@ -492,14 +352,14 @@ igt_main
>  		{ "userptr-rebind", USERPTR | REBIND },
>  		{ "userptr-invalidate", USERPTR | INVALIDATE },
>  		{ "userptr-invalidate-race", USERPTR | INVALIDATE | RACE },
> -		{ "bindexecqueue", BIND_EXECQUEUE },
> -		{ "bindexecqueue-userptr", BIND_EXECQUEUE | USERPTR },
> -		{ "bindexecqueue-rebind",  BIND_EXECQUEUE | REBIND },
> -		{ "bindexecqueue-userptr-rebind",  BIND_EXECQUEUE | USERPTR |
> +		{ "bindexecqueue", BIND_EXEC_QUEUE },
> +		{ "bindexecqueue-userptr", BIND_EXEC_QUEUE | USERPTR },
> +		{ "bindexecqueue-rebind",  BIND_EXEC_QUEUE | REBIND },
> +		{ "bindexecqueue-userptr-rebind",  BIND_EXEC_QUEUE | USERPTR |
>  			REBIND },
> -		{ "bindexecqueue-userptr-invalidate",  BIND_EXECQUEUE | USERPTR |
> +		{ "bindexecqueue-userptr-invalidate",  BIND_EXEC_QUEUE | USERPTR |
>  			INVALIDATE },
> -		{ "bindexecqueue-userptr-invalidate-race", BIND_EXECQUEUE | USERPTR |
> +		{ "bindexecqueue-userptr-invalidate-race", BIND_EXEC_QUEUE | USERPTR |
>  			INVALIDATE | RACE },
>  		{ NULL },
>  	};
> diff --git a/tests/intel/xe_exec_fault_mode.c b/tests/intel/xe_exec_fault_mode.c
> index 40fe1743e..7b2fef224 100644
> --- a/tests/intel/xe_exec_fault_mode.c
> +++ b/tests/intel/xe_exec_fault_mode.c
> @@ -16,24 +16,13 @@
>  #include "igt.h"
>  #include "lib/igt_syncobj.h"
>  #include "lib/intel_reg.h"
> +#include "lib/xe/xe_util.h"
>  #include "xe_drm.h"
>  
>  #include "xe/xe_ioctl.h"
>  #include "xe/xe_query.h"
>  #include <string.h>
>  
> -#define MAX_N_EXEC_QUEUES	16
> -
> -#define USERPTR		(0x1 << 0)
> -#define REBIND		(0x1 << 1)
> -#define INVALIDATE	(0x1 << 2)
> -#define RACE		(0x1 << 3)
> -#define BIND_EXEC_QUEUE	(0x1 << 4)
> -#define PREFETCH	(0x1 << 5)
> -#define INVALID_FAULT	(0x1 << 6)
> -#define INVALID_VA	(0x1 << 7)
> -#define ENABLE_SCRATCH  (0x1 << 8)
> -
>  /**
>   * SUBTEST: invalid-va
>   * Description: Access invalid va and check for EIO through user fence.
> @@ -99,7 +88,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  {
>  	uint32_t vm;
>  	uint64_t addr = 0x1a0000;
> -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
>  	struct drm_xe_sync sync[1] = {
>  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
>  	          .timeline_value = USER_FENCE_VALUE },
> @@ -113,14 +101,9 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
>  	size_t bo_size;
>  	uint32_t bo = 0;
> -	struct {
> -		uint32_t batch[16];
> -		uint64_t pad;
> -		uint64_t vm_sync;
> -		uint64_t exec_sync;
> -		uint32_t data;
> -	} *data;
> -	int i, j, b;
> +	struct cpu_va *data;
> +	uint64_t placement;
> +	int i, j;
>  	int map_fd = -1;
>  
>  	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> @@ -134,144 +117,19 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  	bo_size = sizeof(*data) * n_execs;
>  	bo_size = xe_bb_size(fd, bo_size);
>  
> -	if (flags & USERPTR) {
> -#define	MAP_ADDRESS	0x00007fadeadbe000
> -		if (flags & INVALIDATE) {
> -			data = mmap((void *)MAP_ADDRESS, bo_size, PROT_READ |
> -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> -				    MAP_ANONYMOUS, -1, 0);
> -			igt_assert(data != MAP_FAILED);
> -		} else {
> -			data = aligned_alloc(xe_get_default_alignment(fd),
> -					     bo_size);
> -			igt_assert(data);
> -		}
> -	} else {
> -		if (flags & PREFETCH)
> -			bo = xe_bo_create(fd, 0, bo_size,
> -					  all_memory_regions(fd) |
> -					  vram_if_possible(fd, 0),
> -					  DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -		else
> -			bo = xe_bo_create(fd, 0, bo_size,
> -					  vram_if_possible(fd, eci->gt_id),
> -					  DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -		data = xe_bo_map(fd, bo, bo_size);
> -	}
> -	memset(data, 0, bo_size);
> -
> -	for (i = 0; i < n_exec_queues; i++) {
> -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> -		if (flags & BIND_EXEC_QUEUE)
> -			bind_exec_queues[i] =
> -				xe_bind_exec_queue_create(fd, vm, 0);
> -		else
> -			bind_exec_queues[i] = 0;
> -	};
> -
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	if (bo)
> -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr, bo_size, sync, 1);
> -	else
> -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> -					 to_user_pointer(data), addr,
> -					 bo_size, sync, 1);
> -#define ONE_SEC	MS_TO_NS(1000)
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], ONE_SEC);
> -	data[0].vm_sync = 0;
> -
> -	if (flags & PREFETCH) {
> -		/* Should move to system memory */
> -		xe_vm_prefetch_async(fd, vm, bind_exec_queues[0], 0, addr,
> -				     bo_size, sync, 1, 0);
> -		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -			       bind_exec_queues[0], ONE_SEC);
> -		data[0].vm_sync = 0;
> -	}
> -
> -	for (i = 0; i < n_execs; i++) {
> -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> -		uint64_t batch_addr = addr + batch_offset;
> -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> -		uint64_t sdi_addr = addr + sdi_offset;
> -		int e = i % n_exec_queues;
> +	placement = (flags & PREFETCH) ? all_memory_regions(fd) |
> +		vram_if_possible(fd, 0) : vram_if_possible(fd, eci->gt_id);
>  
> -		b = 0;
> -		if (flags & INVALID_VA)
> -			sdi_addr = 0x1fffffffffff000;
> +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
>  
> -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> -		data[i].batch[b++] = sdi_addr;
> -		data[i].batch[b++] = sdi_addr >> 32;
> -		data[i].batch[b++] = 0xc0ffee;
> -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> +	create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> +			n_exec_queues, eci, flags);
>  
> -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data, flags);
>  
> -		exec.exec_queue_id = exec_queues[e];
> -		exec.address = batch_addr;
> -		xe_exec(fd, &exec);
> +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd, flags);
>  
> -		if (flags & REBIND && i + 1 != n_execs) {
> -			xe_wait_ufence(fd, &data[i].exec_sync, USER_FENCE_VALUE,
> -				       exec_queues[e], ONE_SEC);
> -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> -					   addr, bo_size, NULL, 0);
> -
> -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -			addr += bo_size;
> -			if (bo)
> -				xe_vm_bind_async(fd, vm, bind_exec_queues[e], bo,
> -						 0, addr, bo_size, sync, 1);
> -			else
> -				xe_vm_bind_userptr_async(fd, vm,
> -							 bind_exec_queues[e],
> -							 to_user_pointer(data),
> -							 addr, bo_size, sync,
> -							 1);
> -			xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -				       bind_exec_queues[e], ONE_SEC);
> -			data[0].vm_sync = 0;
> -		}
> -
> -		if (flags & INVALIDATE && i + 1 != n_execs) {
> -			if (!(flags & RACE)) {
> -				/*
> -				 * Wait for exec completion and check data as
> -				 * userptr will likely change to different
> -				 * physical memory on next mmap call triggering
> -				 * an invalidate.
> -				 */
> -				xe_wait_ufence(fd, &data[i].exec_sync,
> -					       USER_FENCE_VALUE, exec_queues[e],
> -					       ONE_SEC);
> -				igt_assert_eq(data[i].data, 0xc0ffee);
> -			} else if (i * 2 != n_execs) {
> -				/*
> -				 * We issue 1 mmap which races against running
> -				 * jobs. No real check here aside from this test
> -				 * not faulting on the GPU.
> -				 */
> -				continue;
> -			}
> -
> -			if (flags & RACE) {
> -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> -					      0x666);
> -				write(map_fd, data, bo_size);
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE, MAP_SHARED |
> -					    MAP_FIXED, map_fd, 0);
> -			} else {
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE, MAP_SHARED |
> -					    MAP_FIXED | MAP_ANONYMOUS, -1, 0);
> -			}
> -			igt_assert(data != MAP_FAILED);
> -		}
> -	}
>  	if (!(flags & INVALID_FAULT)) {
>  		int64_t timeout = ONE_SEC;
>  
> @@ -286,22 +144,15 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  							       exec_queues[i % n_exec_queues], &timeout), 0);
>  		}
>  	}
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> -			   sync, 1);
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], ONE_SEC);
> +
> +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
>  
>  	if (!(flags & INVALID_FAULT) && !(flags & INVALID_VA)) {
>  		for (i = j; i < n_execs; i++)
>  				igt_assert_eq(data[i].data, 0xc0ffee);
>  	}
>  
> -	for (i = 0; i < n_exec_queues; i++) {
> -		xe_exec_queue_destroy(fd, exec_queues[i]);
> -		if (bind_exec_queues[i])
> -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> -	}
> +	destory_exec_queue(fd, exec_queues, bind_exec_queues, n_exec_queues);
>  
>  	if (bo) {
>  		munmap(data, bo_size);
> -- 
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] lib/xe/xe_util: Creating the helper functions
  2024-03-31 21:05 ` [PATCH] " Matthew Brost
@ 2024-04-01 16:10   ` Zeng, Oak
  2024-04-01 18:40     ` Matthew Brost
  0 siblings, 1 reply; 9+ messages in thread
From: Zeng, Oak @ 2024-04-01 16:10 UTC (permalink / raw)
  To: Brost, Matthew, Bommu, Krishnaiah
  Cc: intel-xe@lists.freedesktop.org, Kumar,  Janga Rahul,
	Ghimiray, Himal Prasad, Chehab, Mauro

Hi Matt,

> -----Original Message-----
> From: Brost, Matthew <matthew.brost@intel.com>
> Sent: Sunday, March 31, 2024 5:05 PM
> To: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>
> Cc: intel-xe@lists.freedesktop.org; Kumar, Janga Rahul
> <janga.rahul.kumar@intel.com>; Ghimiray, Himal Prasad
> <himal.prasad.ghimiray@intel.com>; Zeng, Oak <oak.zeng@intel.com>
> Subject: Re: [PATCH] lib/xe/xe_util: Creating the helper functions
> 
> On Mon, Apr 01, 2024 at 12:29:49AM +0530, Bommu Krishnaiah wrote:
> > Creating the helper functions which can be reused by many tests,
> > as part of this patch xe_exec_compute_mode and xe_exec_fault_mode
> > are used this helper functions.
> >
> 
> As the author of most these tests and IGTs, I can say while not great to
> have copy paste between tests I much prefer it to adding helpers to try
> to jam the needs of each test into the helpers. IMO this just creates a
> headache in the long run. e.g. Let's say I want to change the behavior
> of xe_exec_compute_mode but not xe_exec_fault_mode, with shared helpers
> this gets harder.

Duplicated codes is not a good practice in software. When you have a fix, you need to fix all the places where you have the duplicated codes.

In the example you give, it depends on how we make the helpers. If the helpers are not abstract well enough, we will run into the problem you said. 

The helper idea is inherited from i915. With those helpers, we can write a test with less than 50 lines of codes. In many cases, just ~20 lines of code, very easy to read and debug. Just take a look of i915 gem_vm_pagefault.c and i915_svm.c.

Xe igt is rather complicated compared above i915 tests. In the past few days, I had to debug into xe-exec-threads, it took me long time to really understand the test. The main reason is, we have a huge function such as test_legacy_mode, and flags is used to control the behavior of the test. This is hard to read, maintain and debug.

> 
> I'd much rather leave this as is.
> 
> Maybe I can buy a helper or two to program the batches, i.e.
> insert_store LGTM, but certainly not the logic doing bind, invalidates,
> loop, etc...


Let me take a look of the helpers, and see whether we can make it better. Our plan is to use those helpers for system allocator test. We use existing tests to test those helpers because right now we don't have system allocator ready. So let's improve it first. We might end up something which is good for both existing test and coming system allocator test. 

We will let you decide whether you want to use those helpers for existing tests. If we can't make something good for existing tests, we will target to using those helpers only for the coming system allocator tests.

Thanks,
Oak

> 
> Matt
> 
> > Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu@intel.com>
> > Cc: Janga Rahul Kumar <janga.rahul.kumar@intel.com>
> > Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> > Cc: Oak Zeng <oak.zeng@intel.com>
> > ---
> >  lib/xe/xe_util.c                   | 194 +++++++++++++++++++++++++++++
> >  lib/xe/xe_util.h                   |  48 +++++++
> >  tests/intel/xe_exec_compute_mode.c | 190 ++++------------------------
> >  tests/intel/xe_exec_fault_mode.c   | 179 +++-----------------------
> >  4 files changed, 282 insertions(+), 329 deletions(-)
> >
> > diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c
> > index 050162b5e..17cd45e45 100644
> > --- a/lib/xe/xe_util.c
> > +++ b/lib/xe/xe_util.c
> > @@ -255,3 +255,197 @@ bool xe_is_gt_in_c6(int fd, int gt)
> >
> >  	return strcmp(gt_c_state, "gt-c6") == 0;
> >  }
> > +
> > +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> > +		uint32_t bo_size, uint64_t placement, unsigned int flags)
> > +{
> > +	struct cpu_va *data;
> > +
> > +	if (flags & USERPTR) {
> > +		if (flags & INVALIDATE) {
> > +			data = mmap((void *)MAP_ADDRESS, bo_size,
> PROT_READ | PROT_WRITE,
> > +					MAP_SHARED | MAP_FIXED |
> MAP_ANONYMOUS, -1, 0);
> > +			igt_assert(data != MAP_FAILED);
> > +		} else {
> > +			data = aligned_alloc(xe_get_default_alignment(fd),
> bo_size);
> > +			igt_assert(data);
> > +		}
> > +	} else {
> > +		*bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0, bo_size,
> placement,
> > +
> 	DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > +		data = xe_bo_map(fd, *bo, bo_size);
> > +	}
> > +
> > +	memset(data, 0, bo_size);
> > +
> > +	return data;
> > +}
> > +
> > +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> > +		uint32_t *bind_exec_queues, int n_exec_queues,
> > +		struct drm_xe_engine_class_instance *eci, unsigned int flags)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < n_exec_queues; i++) {
> > +		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > +		if (flags & BIND_EXEC_QUEUE)
> > +			bind_exec_queues[i] = xe_bind_exec_queue_create(fd,
> vm, 0);
> > +		else
> > +			bind_exec_queues[i] = 0;
> > +	};
> > +}
> > +
> > +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
> > +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> > +		struct cpu_va *data, unsigned int flags)
> > +{
> > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> ONE_SEC;
> > +
> > +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > +	if (bo)
> > +		xe_vm_bind_async(fd, vm, bind_exec_queues, bo, 0, addr,
> bo_size, sync, 1);
> > +	else
> > +		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues,
> > +				to_user_pointer(data), addr,
> > +				bo_size, sync, 1);
> > +
> > +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > +			bind_exec_queues, fence_timeout);
> > +	data[0].vm_sync = 0;
> > +
> > +	if (flags & PREFETCH) {
> > +		/* Should move to system memory */
> > +		xe_vm_prefetch_async(fd, vm, bind_exec_queues, 0, addr,
> > +				bo_size, sync, 1, 0);
> > +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > +				bind_exec_queues, fence_timeout);
> > +		data[0].vm_sync = 0;
> > +	}
> > +}
> > +
> > +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i)
> > +{
> > +	int b = 0;
> > +	data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > +	data[i].batch[b++] = dst;
> > +	data[i].batch[b++] = dst >> 32;
> > +	data[i].batch[b++] = val;
> > +	data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > +	igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > +}
> > +
> > +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
> > +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> > +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> > +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> > +		int *map_fd, unsigned int flags)
> > +{
> > +	int i;
> > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> ONE_SEC;
> > +
> > +	for (i = 0; i < n_execs; i++) {
> > +		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > +		uint64_t batch_addr = addr + batch_offset;
> > +		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > +		uint64_t sdi_addr = addr + sdi_offset;
> > +		int e = i % n_exec_queues;
> > +
> > +		if (flags & INVALID_VA)
> > +			sdi_addr = 0x1fffffffffff000;
> > +
> > +		insert_store(sdi_addr, data, 0xc0ffee, i);
> > +
> > +		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > +
> > +		exec->exec_queue_id = exec_queues[e];
> > +		exec->address = batch_addr;
> > +		xe_exec(fd, exec);
> > +
> > +		if (flags & REBIND && i + 1 != n_execs) {
> > +			xe_wait_ufence(fd, &data[i].exec_sync,
> USER_FENCE_VALUE,
> > +					exec_queues[e], fence_timeout);
> > +			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > +					addr, bo_size, NULL, 0);
> > +
> > +			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > +			addr += bo_size;
> > +
> > +			if (bo)
> > +				xe_vm_bind_async(fd, vm,
> bind_exec_queues[e], bo,
> > +						0, addr, bo_size, sync, 1);
> > +			else
> > +				xe_vm_bind_userptr_async(fd, vm,
> > +						bind_exec_queues[e],
> > +						to_user_pointer(data),
> > +						addr, bo_size, sync, 1);
> > +
> > +			xe_wait_ufence(fd, &data[0].vm_sync,
> USER_FENCE_VALUE,
> > +					bind_exec_queues[e], fence_timeout);
> > +			data[0].vm_sync = 0;
> > +		}
> > +
> > +		if (flags & INVALIDATE && i + 1 != n_execs) {
> > +			if (!(flags & RACE)) {
> > +				/*
> > +				 * Wait for exec completion and check data as
> > +				 * userptr will likely change to different
> > +				 * physical memory on next mmap call triggering
> > +				 * an invalidate.
> > +				 */
> > +				xe_wait_ufence(fd, &data[i].exec_sync,
> > +						USER_FENCE_VALUE,
> exec_queues[e],
> > +						fence_timeout);
> > +				igt_assert_eq(data[i].data, 0xc0ffee);
> > +			} else if (i * 2 != n_execs) {
> > +				/*
> > +				 * We issue 1 mmap which races against running
> > +				 * jobs. No real check here aside from this test
> > +				 * not faulting on the GPU.
> > +				 */
> > +				continue;
> > +			}
> > +
> > +			if (flags & RACE) {
> > +				*map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > +						0x666);
> > +				write(*map_fd, data, bo_size);
> > +				data = mmap((void *)MAP_ADDRESS, bo_size,
> > +						PROT_READ | PROT_WRITE,
> MAP_SHARED |
> > +						MAP_FIXED, *map_fd, 0);
> > +			} else {
> > +				data = mmap((void *)MAP_ADDRESS, bo_size,
> > +						PROT_READ | PROT_WRITE,
> MAP_SHARED |
> > +						MAP_FIXED |
> MAP_ANONYMOUS, -1, 0);
> > +			}
> > +
> > +			igt_assert(data != MAP_FAILED);
> > +		}
> > +	}
> > +}
> > +
> > +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> > +		struct drm_xe_sync *sync, struct cpu_va *data, uint64_t addr,
> > +		uint32_t bo_size)
> > +{
> > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> ONE_SEC;
> > +
> > +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > +	xe_vm_unbind_async(fd, vm, bind_exec_queues, 0, addr, bo_size,
> > +			sync, 1);
> > +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > +			bind_exec_queues, fence_timeout);
> > +}
> > +
> > +void destory_exec_queue(int fd, uint32_t *exec_queues,
> > +		uint32_t *bind_exec_queues, int n_exec_queues)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < n_exec_queues; i++) {
> > +		xe_exec_queue_destroy(fd, exec_queues[i]);
> > +		if (bind_exec_queues[i])
> > +			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > +	}
> > +}
> > +
> > diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h
> > index 6480ea01a..aa1b0dcc1 100644
> > --- a/lib/xe/xe_util.h
> > +++ b/lib/xe/xe_util.h
> > @@ -47,4 +47,52 @@ void xe_bind_unbind_async(int fd, uint32_t vm, uint32_t
> bind_engine,
> >
> >  bool xe_is_gt_in_c6(int fd, int gt);
> >
> > +
> > +#define USERPTR			(0x1 << 0)
> > +#define REBIND			(0x1 << 1)
> > +#define INVALIDATE		(0x1 << 2)
> > +#define RACE			(0x1 << 3)
> > +#define BIND_EXEC_QUEUE		(0x1 << 4)
> > +#define PREFETCH		(0x1 << 5)
> > +#define INVALID_FAULT		(0x1 << 6)
> > +#define INVALID_VA		(0x1 << 7)
> > +#define ENABLE_SCRATCH		(0x1 << 8)
> > +#define VM_FOR_BO		(0x1 << 9)
> > +#define EXEC_QUEUE_EARLY	(0x1 << 10)
> > +
> > +#define MAX_N_EXEC_QUEUES	16
> > +#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > +#define MAP_ADDRESS		0x00007fadeadbe000
> > +
> > +#define ONE_SEC		MS_TO_NS(1000)
> > +#define HUNDRED_SEC	MS_TO_NS(100000)
> > +
> > +struct cpu_va {
> > +	uint32_t batch[16];
> > +	uint64_t pad;
> > +	uint64_t vm_sync;
> > +	uint64_t exec_sync;
> > +	uint32_t data;
> > +};
> > +
> > +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> > +		uint32_t bo_size, uint64_t placement,  unsigned int flags);
> > +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> > +		uint32_t *bind_exec_queues, int n_exec_queues,
> > +		struct drm_xe_engine_class_instance *eci, unsigned int flags);
> > +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
> > +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> > +		struct cpu_va *data, unsigned int flags);
> > +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
> > +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> > +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> > +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> > +		int *map_fd, unsigned int flags);
> > +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i);
> > +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> > +		struct drm_xe_sync *sync, struct cpu_va *data,
> > +		uint64_t addr, uint32_t bo_size);
> > +void destory_exec_queue(int fd, uint32_t *exec_queues,
> > +		uint32_t *bind_exec_queues, int n_exec_queues);
> > +
> >  #endif /* XE_UTIL_H */
> > diff --git a/tests/intel/xe_exec_compute_mode.c
> b/tests/intel/xe_exec_compute_mode.c
> > index 3ec848164..e8d82cc69 100644
> > --- a/tests/intel/xe_exec_compute_mode.c
> > +++ b/tests/intel/xe_exec_compute_mode.c
> > @@ -15,6 +15,7 @@
> >  #include "igt.h"
> >  #include "lib/igt_syncobj.h"
> >  #include "lib/intel_reg.h"
> > +#include "lib/xe/xe_util.h"
> >  #include <sys/ioctl.h>
> >  #include "xe_drm.h"
> >
> > @@ -23,15 +24,6 @@
> >  #include "xe/xe_spin.h"
> >  #include <string.h>
> >
> > -#define MAX_N_EXECQUEUES 	16
> > -#define USERPTR				(0x1 << 0)
> > -#define REBIND				(0x1 << 1)
> > -#define INVALIDATE			(0x1 << 2)
> > -#define RACE				(0x1 << 3)
> > -#define BIND_EXECQUEUE		(0x1 << 4)
> > -#define VM_FOR_BO			(0x1 << 5)
> > -#define EXEC_QUEUE_EARLY	(0x1 << 6)
> > -
> >  /**
> >   * SUBTEST: twice-%s
> >   * Description: Run %arg[1] compute machine test twice
> > @@ -88,7 +80,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> *eci,
> >  {
> >  	uint32_t vm;
> >  	uint64_t addr = 0x1a0000;
> > -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> >  	struct drm_xe_sync sync[1] = {
> >  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
> >  		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> > @@ -99,161 +90,38 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> *eci,
> >  		.num_syncs = 1,
> >  		.syncs = to_user_pointer(sync),
> >  	};
> > -	uint32_t exec_queues[MAX_N_EXECQUEUES];
> > -	uint32_t bind_exec_queues[MAX_N_EXECQUEUES];
> > +	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> > +	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
> >  	size_t bo_size;
> >  	uint32_t bo = 0;
> > -	struct {
> > -		uint32_t batch[16];
> > -		uint64_t pad;
> > -		uint64_t vm_sync;
> > -		uint64_t exec_sync;
> > -		uint32_t data;
> > -	} *data;
> > -	int i, j, b;
> > +	struct cpu_va *data;
> > +	int i, j;
> >  	int map_fd = -1;
> >  	int64_t fence_timeout;
> > +	uint64_t placement;
> >
> > -	igt_assert(n_exec_queues <= MAX_N_EXECQUEUES);
> > +	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> >
> >  	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
> >  	bo_size = sizeof(*data) * n_execs;
> >  	bo_size = xe_bb_size(fd, bo_size);
> >
> > -	for (i = 0; (flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > -		if (flags & BIND_EXECQUEUE)
> > -			bind_exec_queues[i] =
> > -				xe_bind_exec_queue_create(fd, vm, 0);
> > -		else
> > -			bind_exec_queues[i] = 0;
> > -	};
> > +	if (flags & EXEC_QUEUE_EARLY)
> > +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> n_exec_queues, eci, flags);
> >
> > -	if (flags & USERPTR) {
> > -#define	MAP_ADDRESS	0x00007fadeadbe000
> > -		if (flags & INVALIDATE) {
> > -			data = mmap((void *)MAP_ADDRESS, bo_size,
> PROT_READ |
> > -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> > -				    MAP_ANONYMOUS, -1, 0);
> > -			igt_assert(data != MAP_FAILED);
> > -		} else {
> > -			data = aligned_alloc(xe_get_default_alignment(fd),
> > -					     bo_size);
> > -			igt_assert(data);
> > -		}
> > -	} else {
> > -		bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0,
> > -				  bo_size, vram_if_possible(fd, eci->gt_id),
> > -
> DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > -		data = xe_bo_map(fd, bo, bo_size);
> > -	}
> > -	memset(data, 0, bo_size);
> > +	placement = vram_if_possible(fd, eci->gt_id);
> >
> > -	for (i = 0; !(flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > -		if (flags & BIND_EXECQUEUE)
> > -			bind_exec_queues[i] =
> > -				xe_bind_exec_queue_create(fd, vm, 0);
> > -		else
> > -			bind_exec_queues[i] = 0;
> > -	};
> > +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> >
> > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > -	if (bo)
> > -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> > -				 bo_size, sync, 1);
> > -	else
> > -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> > -					 to_user_pointer(data), addr,
> > -					 bo_size, sync, 1);
> > -#define ONE_SEC	MS_TO_NS(1000)
> > -#define HUNDRED_SEC	MS_TO_NS(100000)
> > +	if(!(flags & EXEC_QUEUE_EARLY))
> > +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> n_exec_queues, eci, flags);
> > +
> > +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> flags);
> >
> >  	fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
> >
> > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > -		       bind_exec_queues[0], fence_timeout);
> > -	data[0].vm_sync = 0;
> > -
> > -	for (i = 0; i < n_execs; i++) {
> > -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > -		uint64_t batch_addr = addr + batch_offset;
> > -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > -		uint64_t sdi_addr = addr + sdi_offset;
> > -		int e = i % n_exec_queues;
> > -
> > -		b = 0;
> > -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > -		data[i].batch[b++] = sdi_addr;
> > -		data[i].batch[b++] = sdi_addr >> 32;
> > -		data[i].batch[b++] = 0xc0ffee;
> > -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > -
> > -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > -
> > -		exec.exec_queue_id = exec_queues[e];
> > -		exec.address = batch_addr;
> > -		xe_exec(fd, &exec);
> > -
> > -		if (flags & REBIND && i + 1 != n_execs) {
> > -			xe_wait_ufence(fd, &data[i].exec_sync,
> USER_FENCE_VALUE,
> > -				       exec_queues[e], fence_timeout);
> > -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > -					   addr, bo_size, NULL, 0);
> > -
> > -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > -			addr += bo_size;
> > -			if (bo)
> > -				xe_vm_bind_async(fd, vm,
> bind_exec_queues[e], bo,
> > -						 0, addr, bo_size, sync, 1);
> > -			else
> > -				xe_vm_bind_userptr_async(fd, vm,
> > -							 bind_exec_queues[e],
> > -							 to_user_pointer(data),
> > -							 addr, bo_size, sync,
> > -							 1);
> > -			xe_wait_ufence(fd, &data[0].vm_sync,
> USER_FENCE_VALUE,
> > -				       bind_exec_queues[e], fence_timeout);
> > -			data[0].vm_sync = 0;
> > -		}
> > -
> > -		if (flags & INVALIDATE && i + 1 != n_execs) {
> > -			if (!(flags & RACE)) {
> > -				/*
> > -				 * Wait for exec completion and check data as
> > -				 * userptr will likely change to different
> > -				 * physical memory on next mmap call triggering
> > -				 * an invalidate.
> > -				 */
> > -				xe_wait_ufence(fd, &data[i].exec_sync,
> > -					       USER_FENCE_VALUE,
> exec_queues[e],
> > -					       fence_timeout);
> > -				igt_assert_eq(data[i].data, 0xc0ffee);
> > -			} else if (i * 2 != n_execs) {
> > -				/*
> > -				 * We issue 1 mmap which races against running
> > -				 * jobs. No real check here aside from this test
> > -				 * not faulting on the GPU.
> > -				 */
> > -				continue;
> > -			}
> > -
> > -			if (flags & RACE) {
> > -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > -					      0x666);
> > -				write(map_fd, data, bo_size);
> > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> > -					    MAP_FIXED, map_fd, 0);
> > -			} else {
> > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> > -					    MAP_FIXED | MAP_ANONYMOUS, -1,
> 0);
> > -			}
> > -			igt_assert(data != MAP_FAILED);
> > -		}
> > -	}
> > +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> > +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd,
> flags);
> >
> >  	j = flags & INVALIDATE ? n_execs - 1 : 0;
> >  	for (i = j; i < n_execs; i++)
> > @@ -264,20 +132,12 @@ test_exec(int fd, struct
> drm_xe_engine_class_instance *eci,
> >  	if (flags & INVALIDATE)
> >  		usleep(250000);
> >
> > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> > -			   sync, 1);
> > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > -		       bind_exec_queues[0], fence_timeout);
> > +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> >
> >  	for (i = j; i < n_execs; i++)
> >  		igt_assert_eq(data[i].data, 0xc0ffee);
> >
> > -	for (i = 0; i < n_exec_queues; i++) {
> > -		xe_exec_queue_destroy(fd, exec_queues[i]);
> > -		if (bind_exec_queues[i])
> > -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > -	}
> > +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> n_exec_queues);
> >
> >  	if (bo) {
> >  		munmap(data, bo_size);
> > @@ -492,14 +352,14 @@ igt_main
> >  		{ "userptr-rebind", USERPTR | REBIND },
> >  		{ "userptr-invalidate", USERPTR | INVALIDATE },
> >  		{ "userptr-invalidate-race", USERPTR | INVALIDATE | RACE },
> > -		{ "bindexecqueue", BIND_EXECQUEUE },
> > -		{ "bindexecqueue-userptr", BIND_EXECQUEUE | USERPTR },
> > -		{ "bindexecqueue-rebind",  BIND_EXECQUEUE | REBIND },
> > -		{ "bindexecqueue-userptr-rebind",  BIND_EXECQUEUE |
> USERPTR |
> > +		{ "bindexecqueue", BIND_EXEC_QUEUE },
> > +		{ "bindexecqueue-userptr", BIND_EXEC_QUEUE | USERPTR },
> > +		{ "bindexecqueue-rebind",  BIND_EXEC_QUEUE | REBIND },
> > +		{ "bindexecqueue-userptr-rebind",  BIND_EXEC_QUEUE |
> USERPTR |
> >  			REBIND },
> > -		{ "bindexecqueue-userptr-invalidate",  BIND_EXECQUEUE |
> USERPTR |
> > +		{ "bindexecqueue-userptr-invalidate",  BIND_EXEC_QUEUE |
> USERPTR |
> >  			INVALIDATE },
> > -		{ "bindexecqueue-userptr-invalidate-race", BIND_EXECQUEUE |
> USERPTR |
> > +		{ "bindexecqueue-userptr-invalidate-race", BIND_EXEC_QUEUE |
> USERPTR |
> >  			INVALIDATE | RACE },
> >  		{ NULL },
> >  	};
> > diff --git a/tests/intel/xe_exec_fault_mode.c
> b/tests/intel/xe_exec_fault_mode.c
> > index 40fe1743e..7b2fef224 100644
> > --- a/tests/intel/xe_exec_fault_mode.c
> > +++ b/tests/intel/xe_exec_fault_mode.c
> > @@ -16,24 +16,13 @@
> >  #include "igt.h"
> >  #include "lib/igt_syncobj.h"
> >  #include "lib/intel_reg.h"
> > +#include "lib/xe/xe_util.h"
> >  #include "xe_drm.h"
> >
> >  #include "xe/xe_ioctl.h"
> >  #include "xe/xe_query.h"
> >  #include <string.h>
> >
> > -#define MAX_N_EXEC_QUEUES	16
> > -
> > -#define USERPTR		(0x1 << 0)
> > -#define REBIND		(0x1 << 1)
> > -#define INVALIDATE	(0x1 << 2)
> > -#define RACE		(0x1 << 3)
> > -#define BIND_EXEC_QUEUE	(0x1 << 4)
> > -#define PREFETCH	(0x1 << 5)
> > -#define INVALID_FAULT	(0x1 << 6)
> > -#define INVALID_VA	(0x1 << 7)
> > -#define ENABLE_SCRATCH  (0x1 << 8)
> > -
> >  /**
> >   * SUBTEST: invalid-va
> >   * Description: Access invalid va and check for EIO through user fence.
> > @@ -99,7 +88,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> *eci,
> >  {
> >  	uint32_t vm;
> >  	uint64_t addr = 0x1a0000;
> > -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> >  	struct drm_xe_sync sync[1] = {
> >  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags =
> DRM_XE_SYNC_FLAG_SIGNAL,
> >  	          .timeline_value = USER_FENCE_VALUE },
> > @@ -113,14 +101,9 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> *eci,
> >  	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
> >  	size_t bo_size;
> >  	uint32_t bo = 0;
> > -	struct {
> > -		uint32_t batch[16];
> > -		uint64_t pad;
> > -		uint64_t vm_sync;
> > -		uint64_t exec_sync;
> > -		uint32_t data;
> > -	} *data;
> > -	int i, j, b;
> > +	struct cpu_va *data;
> > +	uint64_t placement;
> > +	int i, j;
> >  	int map_fd = -1;
> >
> >  	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> > @@ -134,144 +117,19 @@ test_exec(int fd, struct
> drm_xe_engine_class_instance *eci,
> >  	bo_size = sizeof(*data) * n_execs;
> >  	bo_size = xe_bb_size(fd, bo_size);
> >
> > -	if (flags & USERPTR) {
> > -#define	MAP_ADDRESS	0x00007fadeadbe000
> > -		if (flags & INVALIDATE) {
> > -			data = mmap((void *)MAP_ADDRESS, bo_size,
> PROT_READ |
> > -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> > -				    MAP_ANONYMOUS, -1, 0);
> > -			igt_assert(data != MAP_FAILED);
> > -		} else {
> > -			data = aligned_alloc(xe_get_default_alignment(fd),
> > -					     bo_size);
> > -			igt_assert(data);
> > -		}
> > -	} else {
> > -		if (flags & PREFETCH)
> > -			bo = xe_bo_create(fd, 0, bo_size,
> > -					  all_memory_regions(fd) |
> > -					  vram_if_possible(fd, 0),
> > -
> DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > -		else
> > -			bo = xe_bo_create(fd, 0, bo_size,
> > -					  vram_if_possible(fd, eci->gt_id),
> > -
> DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > -		data = xe_bo_map(fd, bo, bo_size);
> > -	}
> > -	memset(data, 0, bo_size);
> > -
> > -	for (i = 0; i < n_exec_queues; i++) {
> > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > -		if (flags & BIND_EXEC_QUEUE)
> > -			bind_exec_queues[i] =
> > -				xe_bind_exec_queue_create(fd, vm, 0);
> > -		else
> > -			bind_exec_queues[i] = 0;
> > -	};
> > -
> > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > -	if (bo)
> > -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> bo_size, sync, 1);
> > -	else
> > -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> > -					 to_user_pointer(data), addr,
> > -					 bo_size, sync, 1);
> > -#define ONE_SEC	MS_TO_NS(1000)
> > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > -		       bind_exec_queues[0], ONE_SEC);
> > -	data[0].vm_sync = 0;
> > -
> > -	if (flags & PREFETCH) {
> > -		/* Should move to system memory */
> > -		xe_vm_prefetch_async(fd, vm, bind_exec_queues[0], 0, addr,
> > -				     bo_size, sync, 1, 0);
> > -		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > -			       bind_exec_queues[0], ONE_SEC);
> > -		data[0].vm_sync = 0;
> > -	}
> > -
> > -	for (i = 0; i < n_execs; i++) {
> > -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > -		uint64_t batch_addr = addr + batch_offset;
> > -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > -		uint64_t sdi_addr = addr + sdi_offset;
> > -		int e = i % n_exec_queues;
> > +	placement = (flags & PREFETCH) ? all_memory_regions(fd) |
> > +		vram_if_possible(fd, 0) : vram_if_possible(fd, eci->gt_id);
> >
> > -		b = 0;
> > -		if (flags & INVALID_VA)
> > -			sdi_addr = 0x1fffffffffff000;
> > +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> >
> > -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > -		data[i].batch[b++] = sdi_addr;
> > -		data[i].batch[b++] = sdi_addr >> 32;
> > -		data[i].batch[b++] = 0xc0ffee;
> > -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > +	create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> > +			n_exec_queues, eci, flags);
> >
> > -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> flags);
> >
> > -		exec.exec_queue_id = exec_queues[e];
> > -		exec.address = batch_addr;
> > -		xe_exec(fd, &exec);
> > +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> > +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd,
> flags);
> >
> > -		if (flags & REBIND && i + 1 != n_execs) {
> > -			xe_wait_ufence(fd, &data[i].exec_sync,
> USER_FENCE_VALUE,
> > -				       exec_queues[e], ONE_SEC);
> > -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > -					   addr, bo_size, NULL, 0);
> > -
> > -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > -			addr += bo_size;
> > -			if (bo)
> > -				xe_vm_bind_async(fd, vm,
> bind_exec_queues[e], bo,
> > -						 0, addr, bo_size, sync, 1);
> > -			else
> > -				xe_vm_bind_userptr_async(fd, vm,
> > -							 bind_exec_queues[e],
> > -							 to_user_pointer(data),
> > -							 addr, bo_size, sync,
> > -							 1);
> > -			xe_wait_ufence(fd, &data[0].vm_sync,
> USER_FENCE_VALUE,
> > -				       bind_exec_queues[e], ONE_SEC);
> > -			data[0].vm_sync = 0;
> > -		}
> > -
> > -		if (flags & INVALIDATE && i + 1 != n_execs) {
> > -			if (!(flags & RACE)) {
> > -				/*
> > -				 * Wait for exec completion and check data as
> > -				 * userptr will likely change to different
> > -				 * physical memory on next mmap call triggering
> > -				 * an invalidate.
> > -				 */
> > -				xe_wait_ufence(fd, &data[i].exec_sync,
> > -					       USER_FENCE_VALUE,
> exec_queues[e],
> > -					       ONE_SEC);
> > -				igt_assert_eq(data[i].data, 0xc0ffee);
> > -			} else if (i * 2 != n_execs) {
> > -				/*
> > -				 * We issue 1 mmap which races against running
> > -				 * jobs. No real check here aside from this test
> > -				 * not faulting on the GPU.
> > -				 */
> > -				continue;
> > -			}
> > -
> > -			if (flags & RACE) {
> > -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > -					      0x666);
> > -				write(map_fd, data, bo_size);
> > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> > -					    MAP_FIXED, map_fd, 0);
> > -			} else {
> > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> > -					    MAP_FIXED | MAP_ANONYMOUS, -1,
> 0);
> > -			}
> > -			igt_assert(data != MAP_FAILED);
> > -		}
> > -	}
> >  	if (!(flags & INVALID_FAULT)) {
> >  		int64_t timeout = ONE_SEC;
> >
> > @@ -286,22 +144,15 @@ test_exec(int fd, struct
> drm_xe_engine_class_instance *eci,
> >  							       exec_queues[i %
> n_exec_queues], &timeout), 0);
> >  		}
> >  	}
> > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> > -			   sync, 1);
> > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > -		       bind_exec_queues[0], ONE_SEC);
> > +
> > +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> >
> >  	if (!(flags & INVALID_FAULT) && !(flags & INVALID_VA)) {
> >  		for (i = j; i < n_execs; i++)
> >  				igt_assert_eq(data[i].data, 0xc0ffee);
> >  	}
> >
> > -	for (i = 0; i < n_exec_queues; i++) {
> > -		xe_exec_queue_destroy(fd, exec_queues[i]);
> > -		if (bind_exec_queues[i])
> > -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > -	}
> > +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> n_exec_queues);
> >
> >  	if (bo) {
> >  		munmap(data, bo_size);
> > --
> > 2.25.1
> >

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] lib/xe/xe_util: Creating the helper functions
  2024-03-31 18:59 [PATCH] lib/xe/xe_util: Creating the helper functions Bommu Krishnaiah
                   ` (2 preceding siblings ...)
  2024-03-31 21:05 ` [PATCH] " Matthew Brost
@ 2024-04-01 17:39 ` Zeng, Oak
  3 siblings, 0 replies; 9+ messages in thread
From: Zeng, Oak @ 2024-04-01 17:39 UTC (permalink / raw)
  To: Bommu, Krishnaiah, intel-xe@lists.freedesktop.org
  Cc: Kumar, Janga Rahul, Ghimiray, Himal Prasad, Brost, Matthew,
	Chehab, Mauro

Hi Krishna,

See comments inline

Cc'ed Matt, Mauro

Hi Matt,

When I looked at this series, I understand your concerns. I commented on the design of the helpers. The idea is:

1) create small helper such as create and map a buffer, destroy a buffer, create and fill a command buffer, submit a command buffer etc. hopefully those small helpers can be used by legacy tests

2)  behaviors such as rebind/invalidate etc will have to be part of the legacy test. It can't be in the helper functions. So with helpers the legacy tests can be simplified a little bit but not much. With this, we won't lost the flexibility as you concerned.

Thanks,
Oak

> -----Original Message-----
> From: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>
> Sent: Sunday, March 31, 2024 3:00 PM
> To: intel-xe@lists.freedesktop.org
> Cc: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Kumar, Janga Rahul
> <janga.rahul.kumar@intel.com>; Ghimiray, Himal Prasad
> <himal.prasad.ghimiray@intel.com>; Zeng, Oak <oak.zeng@intel.com>
> Subject: [PATCH] lib/xe/xe_util: Creating the helper functions

[PATCH i-g-t] prefix so patch can be picked up by correct CI

> 
> Creating the helper functions which can be reused by many tests,
> as part of this patch xe_exec_compute_mode and xe_exec_fault_mode
> are used this helper functions.

Please separate the helpers and applying helpers to existing tests as separate patches

You can also create one patch for one helper function. Smaller patch is always helpful for review.

> 
> Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu@intel.com>
> Cc: Janga Rahul Kumar <janga.rahul.kumar@intel.com>
> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> Cc: Oak Zeng <oak.zeng@intel.com>
> ---
>  lib/xe/xe_util.c                   | 194 +++++++++++++++++++++++++++++
>  lib/xe/xe_util.h                   |  48 +++++++
>  tests/intel/xe_exec_compute_mode.c | 190 ++++------------------------
>  tests/intel/xe_exec_fault_mode.c   | 179 +++-----------------------
>  4 files changed, 282 insertions(+), 329 deletions(-)
> 
> diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c
> index 050162b5e..17cd45e45 100644
> --- a/lib/xe/xe_util.c
> +++ b/lib/xe/xe_util.c
> @@ -255,3 +255,197 @@ bool xe_is_gt_in_c6(int fd, int gt)
> 
>  	return strcmp(gt_c_state, "gt-c6") == 0;
>  }
> +

Please add document for each helpers

For example, for this one the comment would be: create a buffer and map it for both CPU and GPU access

> +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> +		uint32_t bo_size, uint64_t placement, unsigned int flags)

The function name is wrong as we don't have bo for userptr and system allocator. Xe_create_and_map_buffer

s/bo_size/size

you also need other parameters for gpu binding: uinit64_t gpu_va; define a flags for gpu binding such as GPU_BINDING

> +{
> +	struct cpu_va *data;

Shouldn't use a struct. Struct cpu_va is a concept in the existing test. It doesn't make sense to introduce this struct to helpers.

The return of this function is cpu_va which can be a void * 

So when you remove struct cpu_va concept from helper, the caller also need some change to drop struct cpu_va, for example, you will need create and map command buffer and dst buffer and sync object separately, instead of create them together and calculate an offset. I think it is okay.

> +
> +	if (flags & USERPTR) {
> +		if (flags & INVALIDATE) {


INVALIDATE is a concept in existing code. It doesn't make sense to have a INVALIDATE flag here. Maybe call it MMAP flag

> +			data = mmap((void *)MAP_ADDRESS, bo_size,
> PROT_READ | PROT_WRITE,
> +					MAP_SHARED | MAP_FIXED |
> MAP_ANONYMOUS, -1, 0);
> +			igt_assert(data != MAP_FAILED);
> +		} else {
> +			data = aligned_alloc(xe_get_default_alignment(fd),
> bo_size);
> +			igt_assert(data);
> +		}
> +	} else {
> +		*bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0, bo_size,
> placement,
> +
> 	DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> +		data = xe_bo_map(fd, *bo, bo_size);
> +	}
> +
> +	memset(data, 0, bo_size);
> +
> +	return data;
> +}
> +
> +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues,
> +		struct drm_xe_engine_class_instance *eci, unsigned int flags)
> +{
> +	int i;
> +
> +	for (i = 0; i < n_exec_queues; i++) {
> +		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> +		if (flags & BIND_EXEC_QUEUE)
> +			bind_exec_queues[i] = xe_bind_exec_queue_create(fd,
> vm, 0);
> +		else
> +			bind_exec_queues[i] = 0;
> +	};
> +}


I don't think we need this helper. It just put queue creation in a loop. It doesn't help much. 


> +
> +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
> +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> +		struct cpu_va *data, unsigned int flags)

Please make vm bind part of xe_create_and_map_buffer, as commented above


> +{
> +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> ONE_SEC;
> +
> +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +	if (bo)
> +		xe_vm_bind_async(fd, vm, bind_exec_queues, bo, 0, addr,
> bo_size, sync, 1);
> +	else
> +		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues,
> +				to_user_pointer(data), addr,
> +				bo_size, sync, 1);
> +
> +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> +			bind_exec_queues, fence_timeout);
> +	data[0].vm_sync = 0;
> +
> +	if (flags & PREFETCH) {


You can also drop the prefetch function. In most cases, we don't need prefetch. So it might be better the caller just do prefetch by himself. But I don't have a strong opinion here. If you keep it, I am also good.


> +		/* Should move to system memory */
> +		xe_vm_prefetch_async(fd, vm, bind_exec_queues, 0, addr,
> +				bo_size, sync, 1, 0);
> +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> +				bind_exec_queues, fence_timeout);
> +		data[0].vm_sync = 0;
> +	}
> +}
> +
> +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i)

Parameter of this function is wrong.... you need:

void insert_store(uint32_t *batch, uint64_t dst_va, uint32_t val) where batch is the batch buffer's cpu address, dst_va is the gpu address where you want to store


> +{
> +	int b = 0;
> +	data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> +	data[i].batch[b++] = dst;
> +	data[i].batch[b++] = dst >> 32;
> +	data[i].batch[b++] = val;
> +	data[i].batch[b++] = MI_BATCH_BUFFER_END;
> +	igt_assert(b <= ARRAY_SIZE(data[i].batch));
> +}
> +
> +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
> +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> +		int *map_fd, unsigned int flags)

I don't think this is a good abstraction. I think we can create a few smaller helpers in stead of this big one. This one is not flexible. I think Matt had this concerns and I totally agree... 

You need small helpers to:
Create a command buffer
Submit command buffer to gpu
Wait for submission to complete

See below

> +{
> +	int i;
> +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> ONE_SEC;
> +
> +	for (i = 0; i < n_execs; i++) {

All such loop shouldn't be in a helper. 

> +		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> +		uint64_t batch_addr = addr + batch_offset;
> +		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> +		uint64_t sdi_addr = addr + sdi_offset;
> +		int e = i % n_exec_queues;
> +
> +		if (flags & INVALID_VA)
> +			sdi_addr = 0x1fffffffffff000;
> +
> +		insert_store(sdi_addr, data, 0xc0ffee, i);


Make a helper function to create command buffer/batch buffer, and fill buffer with commands

> +
> +		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> +
> +		exec->exec_queue_id = exec_queues[e];
> +		exec->address = batch_addr;

Make a helper to initialize a exec

> +		xe_exec(fd, exec);
> +
> +		if (flags & REBIND && i + 1 != n_execs) {


All this rebind behavior doesn't belong to helper. It should be part of caller/tester behavior

> +			xe_wait_ufence(fd, &data[i].exec_sync,
> USER_FENCE_VALUE,
> +					exec_queues[e], fence_timeout);
> +			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> +					addr, bo_size, NULL, 0);
> +
> +			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +			addr += bo_size;
> +
> +			if (bo)
> +				xe_vm_bind_async(fd, vm,
> bind_exec_queues[e], bo,
> +						0, addr, bo_size, sync, 1);
> +			else
> +				xe_vm_bind_userptr_async(fd, vm,
> +						bind_exec_queues[e],
> +						to_user_pointer(data),
> +						addr, bo_size, sync, 1);
> +
> +			xe_wait_ufence(fd, &data[0].vm_sync,
> USER_FENCE_VALUE,
> +					bind_exec_queues[e], fence_timeout);
> +			data[0].vm_sync = 0;
> +		}
> +
> +		if (flags & INVALIDATE && i + 1 != n_execs) {


All this invalidate behavior doesn't belong to helper. It should be part of caller/tester behavior

Remember our goal is not to make the existing test as simple as possible. Due to the design of the existing test, it is complex. With helpers, if we can simplify it a little, it is good. But don't try to simplify the existing tests as our goal.


> +			if (!(flags & RACE)) {
> +				/*
> +				 * Wait for exec completion and check data as
> +				 * userptr will likely change to different
> +				 * physical memory on next mmap call triggering
> +				 * an invalidate.
> +				 */
> +				xe_wait_ufence(fd, &data[i].exec_sync,
> +						USER_FENCE_VALUE,
> exec_queues[e],
> +						fence_timeout);
> +				igt_assert_eq(data[i].data, 0xc0ffee);
> +			} else if (i * 2 != n_execs) {
> +				/*
> +				 * We issue 1 mmap which races against running
> +				 * jobs. No real check here aside from this test
> +				 * not faulting on the GPU.
> +				 */
> +				continue;
> +			}
> +
> +			if (flags & RACE) {
> +				*map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> +						0x666);
> +				write(*map_fd, data, bo_size);
> +				data = mmap((void *)MAP_ADDRESS, bo_size,
> +						PROT_READ | PROT_WRITE,
> MAP_SHARED |
> +						MAP_FIXED, *map_fd, 0);
> +			} else {
> +				data = mmap((void *)MAP_ADDRESS, bo_size,
> +						PROT_READ | PROT_WRITE,
> MAP_SHARED |
> +						MAP_FIXED |
> MAP_ANONYMOUS, -1, 0);
> +			}
> +
> +			igt_assert(data != MAP_FAILED);
> +		}
> +	}
> +}
> +
> +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> +		struct drm_xe_sync *sync, struct cpu_va *data, uint64_t addr,
> +		uint32_t bo_size)
> +{
> +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> ONE_SEC;
> +
> +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +	xe_vm_unbind_async(fd, vm, bind_exec_queues, 0, addr, bo_size,
> +			sync, 1);
> +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> +			bind_exec_queues, fence_timeout);
> +}


Please make a xe_destroy_buffer to destroy a buffer created through xe_create_buffer. The vm_unbind should be part of xe_destroy_buffer. No need of a separate helper

> +
> +void destory_exec_queue(int fd, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues)


Please drop this one along with create_exec_queue


I will stop here. Let's improve the helpers in v2.

Oak

> +{
> +	int i;
> +
> +	for (i = 0; i < n_exec_queues; i++) {
> +		xe_exec_queue_destroy(fd, exec_queues[i]);
> +		if (bind_exec_queues[i])
> +			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> +	}
> +}
> +
> diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h
> index 6480ea01a..aa1b0dcc1 100644
> --- a/lib/xe/xe_util.h
> +++ b/lib/xe/xe_util.h
> @@ -47,4 +47,52 @@ void xe_bind_unbind_async(int fd, uint32_t vm, uint32_t
> bind_engine,
> 
>  bool xe_is_gt_in_c6(int fd, int gt);
> 
> +
> +#define USERPTR			(0x1 << 0)
> +#define REBIND			(0x1 << 1)
> +#define INVALIDATE		(0x1 << 2)
> +#define RACE			(0x1 << 3)
> +#define BIND_EXEC_QUEUE		(0x1 << 4)
> +#define PREFETCH		(0x1 << 5)
> +#define INVALID_FAULT		(0x1 << 6)
> +#define INVALID_VA		(0x1 << 7)
> +#define ENABLE_SCRATCH		(0x1 << 8)
> +#define VM_FOR_BO		(0x1 << 9)
> +#define EXEC_QUEUE_EARLY	(0x1 << 10)
> +
> +#define MAX_N_EXEC_QUEUES	16
> +#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> +#define MAP_ADDRESS		0x00007fadeadbe000
> +
> +#define ONE_SEC		MS_TO_NS(1000)
> +#define HUNDRED_SEC	MS_TO_NS(100000)
> +
> +struct cpu_va {
> +	uint32_t batch[16];
> +	uint64_t pad;
> +	uint64_t vm_sync;
> +	uint64_t exec_sync;
> +	uint32_t data;
> +};
> +
> +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> +		uint32_t bo_size, uint64_t placement,  unsigned int flags);
> +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues,
> +		struct drm_xe_engine_class_instance *eci, unsigned int flags);
> +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
> +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> +		struct cpu_va *data, unsigned int flags);
> +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
> +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> +		int *map_fd, unsigned int flags);
> +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i);
> +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> +		struct drm_xe_sync *sync, struct cpu_va *data,
> +		uint64_t addr, uint32_t bo_size);
> +void destory_exec_queue(int fd, uint32_t *exec_queues,
> +		uint32_t *bind_exec_queues, int n_exec_queues);
> +
>  #endif /* XE_UTIL_H */
> diff --git a/tests/intel/xe_exec_compute_mode.c
> b/tests/intel/xe_exec_compute_mode.c
> index 3ec848164..e8d82cc69 100644
> --- a/tests/intel/xe_exec_compute_mode.c
> +++ b/tests/intel/xe_exec_compute_mode.c
> @@ -15,6 +15,7 @@
>  #include "igt.h"
>  #include "lib/igt_syncobj.h"
>  #include "lib/intel_reg.h"
> +#include "lib/xe/xe_util.h"
>  #include <sys/ioctl.h>
>  #include "xe_drm.h"
> 
> @@ -23,15 +24,6 @@
>  #include "xe/xe_spin.h"
>  #include <string.h>
> 
> -#define MAX_N_EXECQUEUES 	16
> -#define USERPTR				(0x1 << 0)
> -#define REBIND				(0x1 << 1)
> -#define INVALIDATE			(0x1 << 2)
> -#define RACE				(0x1 << 3)
> -#define BIND_EXECQUEUE		(0x1 << 4)
> -#define VM_FOR_BO			(0x1 << 5)
> -#define EXEC_QUEUE_EARLY	(0x1 << 6)
> -
>  /**
>   * SUBTEST: twice-%s
>   * Description: Run %arg[1] compute machine test twice
> @@ -88,7 +80,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  {
>  	uint32_t vm;
>  	uint64_t addr = 0x1a0000;
> -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
>  	struct drm_xe_sync sync[1] = {
>  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
>  		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> @@ -99,161 +90,38 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> *eci,
>  		.num_syncs = 1,
>  		.syncs = to_user_pointer(sync),
>  	};
> -	uint32_t exec_queues[MAX_N_EXECQUEUES];
> -	uint32_t bind_exec_queues[MAX_N_EXECQUEUES];
> +	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> +	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
>  	size_t bo_size;
>  	uint32_t bo = 0;
> -	struct {
> -		uint32_t batch[16];
> -		uint64_t pad;
> -		uint64_t vm_sync;
> -		uint64_t exec_sync;
> -		uint32_t data;
> -	} *data;
> -	int i, j, b;
> +	struct cpu_va *data;
> +	int i, j;
>  	int map_fd = -1;
>  	int64_t fence_timeout;
> +	uint64_t placement;
> 
> -	igt_assert(n_exec_queues <= MAX_N_EXECQUEUES);
> +	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> 
>  	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
>  	bo_size = sizeof(*data) * n_execs;
>  	bo_size = xe_bb_size(fd, bo_size);
> 
> -	for (i = 0; (flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> -		if (flags & BIND_EXECQUEUE)
> -			bind_exec_queues[i] =
> -				xe_bind_exec_queue_create(fd, vm, 0);
> -		else
> -			bind_exec_queues[i] = 0;
> -	};
> +	if (flags & EXEC_QUEUE_EARLY)
> +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> n_exec_queues, eci, flags);
> 
> -	if (flags & USERPTR) {
> -#define	MAP_ADDRESS	0x00007fadeadbe000
> -		if (flags & INVALIDATE) {
> -			data = mmap((void *)MAP_ADDRESS, bo_size,
> PROT_READ |
> -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> -				    MAP_ANONYMOUS, -1, 0);
> -			igt_assert(data != MAP_FAILED);
> -		} else {
> -			data = aligned_alloc(xe_get_default_alignment(fd),
> -					     bo_size);
> -			igt_assert(data);
> -		}
> -	} else {
> -		bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0,
> -				  bo_size, vram_if_possible(fd, eci->gt_id),
> -
> DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -		data = xe_bo_map(fd, bo, bo_size);
> -	}
> -	memset(data, 0, bo_size);
> +	placement = vram_if_possible(fd, eci->gt_id);
> 
> -	for (i = 0; !(flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> -		if (flags & BIND_EXECQUEUE)
> -			bind_exec_queues[i] =
> -				xe_bind_exec_queue_create(fd, vm, 0);
> -		else
> -			bind_exec_queues[i] = 0;
> -	};
> +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> 
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	if (bo)
> -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> -				 bo_size, sync, 1);
> -	else
> -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> -					 to_user_pointer(data), addr,
> -					 bo_size, sync, 1);
> -#define ONE_SEC	MS_TO_NS(1000)
> -#define HUNDRED_SEC	MS_TO_NS(100000)
> +	if(!(flags & EXEC_QUEUE_EARLY))
> +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> n_exec_queues, eci, flags);
> +
> +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> flags);
> 
>  	fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
> 
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], fence_timeout);
> -	data[0].vm_sync = 0;
> -
> -	for (i = 0; i < n_execs; i++) {
> -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> -		uint64_t batch_addr = addr + batch_offset;
> -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> -		uint64_t sdi_addr = addr + sdi_offset;
> -		int e = i % n_exec_queues;
> -
> -		b = 0;
> -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> -		data[i].batch[b++] = sdi_addr;
> -		data[i].batch[b++] = sdi_addr >> 32;
> -		data[i].batch[b++] = 0xc0ffee;
> -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> -
> -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> -
> -		exec.exec_queue_id = exec_queues[e];
> -		exec.address = batch_addr;
> -		xe_exec(fd, &exec);
> -
> -		if (flags & REBIND && i + 1 != n_execs) {
> -			xe_wait_ufence(fd, &data[i].exec_sync,
> USER_FENCE_VALUE,
> -				       exec_queues[e], fence_timeout);
> -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> -					   addr, bo_size, NULL, 0);
> -
> -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -			addr += bo_size;
> -			if (bo)
> -				xe_vm_bind_async(fd, vm,
> bind_exec_queues[e], bo,
> -						 0, addr, bo_size, sync, 1);
> -			else
> -				xe_vm_bind_userptr_async(fd, vm,
> -							 bind_exec_queues[e],
> -							 to_user_pointer(data),
> -							 addr, bo_size, sync,
> -							 1);
> -			xe_wait_ufence(fd, &data[0].vm_sync,
> USER_FENCE_VALUE,
> -				       bind_exec_queues[e], fence_timeout);
> -			data[0].vm_sync = 0;
> -		}
> -
> -		if (flags & INVALIDATE && i + 1 != n_execs) {
> -			if (!(flags & RACE)) {
> -				/*
> -				 * Wait for exec completion and check data as
> -				 * userptr will likely change to different
> -				 * physical memory on next mmap call triggering
> -				 * an invalidate.
> -				 */
> -				xe_wait_ufence(fd, &data[i].exec_sync,
> -					       USER_FENCE_VALUE,
> exec_queues[e],
> -					       fence_timeout);
> -				igt_assert_eq(data[i].data, 0xc0ffee);
> -			} else if (i * 2 != n_execs) {
> -				/*
> -				 * We issue 1 mmap which races against running
> -				 * jobs. No real check here aside from this test
> -				 * not faulting on the GPU.
> -				 */
> -				continue;
> -			}
> -
> -			if (flags & RACE) {
> -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> -					      0x666);
> -				write(map_fd, data, bo_size);
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> -					    MAP_FIXED, map_fd, 0);
> -			} else {
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> -					    MAP_FIXED | MAP_ANONYMOUS, -1,
> 0);
> -			}
> -			igt_assert(data != MAP_FAILED);
> -		}
> -	}
> +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd,
> flags);
> 
>  	j = flags & INVALIDATE ? n_execs - 1 : 0;
>  	for (i = j; i < n_execs; i++)
> @@ -264,20 +132,12 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> *eci,
>  	if (flags & INVALIDATE)
>  		usleep(250000);
> 
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> -			   sync, 1);
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], fence_timeout);
> +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> 
>  	for (i = j; i < n_execs; i++)
>  		igt_assert_eq(data[i].data, 0xc0ffee);
> 
> -	for (i = 0; i < n_exec_queues; i++) {
> -		xe_exec_queue_destroy(fd, exec_queues[i]);
> -		if (bind_exec_queues[i])
> -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> -	}
> +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> n_exec_queues);
> 
>  	if (bo) {
>  		munmap(data, bo_size);
> @@ -492,14 +352,14 @@ igt_main
>  		{ "userptr-rebind", USERPTR | REBIND },
>  		{ "userptr-invalidate", USERPTR | INVALIDATE },
>  		{ "userptr-invalidate-race", USERPTR | INVALIDATE | RACE },
> -		{ "bindexecqueue", BIND_EXECQUEUE },
> -		{ "bindexecqueue-userptr", BIND_EXECQUEUE | USERPTR },
> -		{ "bindexecqueue-rebind",  BIND_EXECQUEUE | REBIND },
> -		{ "bindexecqueue-userptr-rebind",  BIND_EXECQUEUE |
> USERPTR |
> +		{ "bindexecqueue", BIND_EXEC_QUEUE },
> +		{ "bindexecqueue-userptr", BIND_EXEC_QUEUE | USERPTR },
> +		{ "bindexecqueue-rebind",  BIND_EXEC_QUEUE | REBIND },
> +		{ "bindexecqueue-userptr-rebind",  BIND_EXEC_QUEUE |
> USERPTR |
>  			REBIND },
> -		{ "bindexecqueue-userptr-invalidate",  BIND_EXECQUEUE |
> USERPTR |
> +		{ "bindexecqueue-userptr-invalidate",  BIND_EXEC_QUEUE |
> USERPTR |
>  			INVALIDATE },
> -		{ "bindexecqueue-userptr-invalidate-race", BIND_EXECQUEUE |
> USERPTR |
> +		{ "bindexecqueue-userptr-invalidate-race", BIND_EXEC_QUEUE |
> USERPTR |
>  			INVALIDATE | RACE },
>  		{ NULL },
>  	};
> diff --git a/tests/intel/xe_exec_fault_mode.c
> b/tests/intel/xe_exec_fault_mode.c
> index 40fe1743e..7b2fef224 100644
> --- a/tests/intel/xe_exec_fault_mode.c
> +++ b/tests/intel/xe_exec_fault_mode.c
> @@ -16,24 +16,13 @@
>  #include "igt.h"
>  #include "lib/igt_syncobj.h"
>  #include "lib/intel_reg.h"
> +#include "lib/xe/xe_util.h"
>  #include "xe_drm.h"
> 
>  #include "xe/xe_ioctl.h"
>  #include "xe/xe_query.h"
>  #include <string.h>
> 
> -#define MAX_N_EXEC_QUEUES	16
> -
> -#define USERPTR		(0x1 << 0)
> -#define REBIND		(0x1 << 1)
> -#define INVALIDATE	(0x1 << 2)
> -#define RACE		(0x1 << 3)
> -#define BIND_EXEC_QUEUE	(0x1 << 4)
> -#define PREFETCH	(0x1 << 5)
> -#define INVALID_FAULT	(0x1 << 6)
> -#define INVALID_VA	(0x1 << 7)
> -#define ENABLE_SCRATCH  (0x1 << 8)
> -
>  /**
>   * SUBTEST: invalid-va
>   * Description: Access invalid va and check for EIO through user fence.
> @@ -99,7 +88,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>  {
>  	uint32_t vm;
>  	uint64_t addr = 0x1a0000;
> -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
>  	struct drm_xe_sync sync[1] = {
>  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags =
> DRM_XE_SYNC_FLAG_SIGNAL,
>  	          .timeline_value = USER_FENCE_VALUE },
> @@ -113,14 +101,9 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> *eci,
>  	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
>  	size_t bo_size;
>  	uint32_t bo = 0;
> -	struct {
> -		uint32_t batch[16];
> -		uint64_t pad;
> -		uint64_t vm_sync;
> -		uint64_t exec_sync;
> -		uint32_t data;
> -	} *data;
> -	int i, j, b;
> +	struct cpu_va *data;
> +	uint64_t placement;
> +	int i, j;
>  	int map_fd = -1;
> 
>  	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> @@ -134,144 +117,19 @@ test_exec(int fd, struct
> drm_xe_engine_class_instance *eci,
>  	bo_size = sizeof(*data) * n_execs;
>  	bo_size = xe_bb_size(fd, bo_size);
> 
> -	if (flags & USERPTR) {
> -#define	MAP_ADDRESS	0x00007fadeadbe000
> -		if (flags & INVALIDATE) {
> -			data = mmap((void *)MAP_ADDRESS, bo_size,
> PROT_READ |
> -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> -				    MAP_ANONYMOUS, -1, 0);
> -			igt_assert(data != MAP_FAILED);
> -		} else {
> -			data = aligned_alloc(xe_get_default_alignment(fd),
> -					     bo_size);
> -			igt_assert(data);
> -		}
> -	} else {
> -		if (flags & PREFETCH)
> -			bo = xe_bo_create(fd, 0, bo_size,
> -					  all_memory_regions(fd) |
> -					  vram_if_possible(fd, 0),
> -
> DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -		else
> -			bo = xe_bo_create(fd, 0, bo_size,
> -					  vram_if_possible(fd, eci->gt_id),
> -
> DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -		data = xe_bo_map(fd, bo, bo_size);
> -	}
> -	memset(data, 0, bo_size);
> -
> -	for (i = 0; i < n_exec_queues; i++) {
> -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> -		if (flags & BIND_EXEC_QUEUE)
> -			bind_exec_queues[i] =
> -				xe_bind_exec_queue_create(fd, vm, 0);
> -		else
> -			bind_exec_queues[i] = 0;
> -	};
> -
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	if (bo)
> -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> bo_size, sync, 1);
> -	else
> -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> -					 to_user_pointer(data), addr,
> -					 bo_size, sync, 1);
> -#define ONE_SEC	MS_TO_NS(1000)
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], ONE_SEC);
> -	data[0].vm_sync = 0;
> -
> -	if (flags & PREFETCH) {
> -		/* Should move to system memory */
> -		xe_vm_prefetch_async(fd, vm, bind_exec_queues[0], 0, addr,
> -				     bo_size, sync, 1, 0);
> -		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -			       bind_exec_queues[0], ONE_SEC);
> -		data[0].vm_sync = 0;
> -	}
> -
> -	for (i = 0; i < n_execs; i++) {
> -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> -		uint64_t batch_addr = addr + batch_offset;
> -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> -		uint64_t sdi_addr = addr + sdi_offset;
> -		int e = i % n_exec_queues;
> +	placement = (flags & PREFETCH) ? all_memory_regions(fd) |
> +		vram_if_possible(fd, 0) : vram_if_possible(fd, eci->gt_id);
> 
> -		b = 0;
> -		if (flags & INVALID_VA)
> -			sdi_addr = 0x1fffffffffff000;
> +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> 
> -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> -		data[i].batch[b++] = sdi_addr;
> -		data[i].batch[b++] = sdi_addr >> 32;
> -		data[i].batch[b++] = 0xc0ffee;
> -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> +	create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> +			n_exec_queues, eci, flags);
> 
> -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> flags);
> 
> -		exec.exec_queue_id = exec_queues[e];
> -		exec.address = batch_addr;
> -		xe_exec(fd, &exec);
> +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd,
> flags);
> 
> -		if (flags & REBIND && i + 1 != n_execs) {
> -			xe_wait_ufence(fd, &data[i].exec_sync,
> USER_FENCE_VALUE,
> -				       exec_queues[e], ONE_SEC);
> -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> -					   addr, bo_size, NULL, 0);
> -
> -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -			addr += bo_size;
> -			if (bo)
> -				xe_vm_bind_async(fd, vm,
> bind_exec_queues[e], bo,
> -						 0, addr, bo_size, sync, 1);
> -			else
> -				xe_vm_bind_userptr_async(fd, vm,
> -							 bind_exec_queues[e],
> -							 to_user_pointer(data),
> -							 addr, bo_size, sync,
> -							 1);
> -			xe_wait_ufence(fd, &data[0].vm_sync,
> USER_FENCE_VALUE,
> -				       bind_exec_queues[e], ONE_SEC);
> -			data[0].vm_sync = 0;
> -		}
> -
> -		if (flags & INVALIDATE && i + 1 != n_execs) {
> -			if (!(flags & RACE)) {
> -				/*
> -				 * Wait for exec completion and check data as
> -				 * userptr will likely change to different
> -				 * physical memory on next mmap call triggering
> -				 * an invalidate.
> -				 */
> -				xe_wait_ufence(fd, &data[i].exec_sync,
> -					       USER_FENCE_VALUE,
> exec_queues[e],
> -					       ONE_SEC);
> -				igt_assert_eq(data[i].data, 0xc0ffee);
> -			} else if (i * 2 != n_execs) {
> -				/*
> -				 * We issue 1 mmap which races against running
> -				 * jobs. No real check here aside from this test
> -				 * not faulting on the GPU.
> -				 */
> -				continue;
> -			}
> -
> -			if (flags & RACE) {
> -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> -					      0x666);
> -				write(map_fd, data, bo_size);
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> -					    MAP_FIXED, map_fd, 0);
> -			} else {
> -				data = mmap((void *)MAP_ADDRESS, bo_size,
> -					    PROT_READ | PROT_WRITE,
> MAP_SHARED |
> -					    MAP_FIXED | MAP_ANONYMOUS, -1,
> 0);
> -			}
> -			igt_assert(data != MAP_FAILED);
> -		}
> -	}
>  	if (!(flags & INVALID_FAULT)) {
>  		int64_t timeout = ONE_SEC;
> 
> @@ -286,22 +144,15 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> *eci,
>  							       exec_queues[i %
> n_exec_queues], &timeout), 0);
>  		}
>  	}
> -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> -			   sync, 1);
> -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> -		       bind_exec_queues[0], ONE_SEC);
> +
> +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> 
>  	if (!(flags & INVALID_FAULT) && !(flags & INVALID_VA)) {
>  		for (i = j; i < n_execs; i++)
>  				igt_assert_eq(data[i].data, 0xc0ffee);
>  	}
> 
> -	for (i = 0; i < n_exec_queues; i++) {
> -		xe_exec_queue_destroy(fd, exec_queues[i]);
> -		if (bind_exec_queues[i])
> -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> -	}
> +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> n_exec_queues);
> 
>  	if (bo) {
>  		munmap(data, bo_size);
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] lib/xe/xe_util: Creating the helper functions
  2024-04-01 16:10   ` Zeng, Oak
@ 2024-04-01 18:40     ` Matthew Brost
  2024-04-01 20:28       ` Zeng, Oak
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Brost @ 2024-04-01 18:40 UTC (permalink / raw)
  To: Zeng, Oak
  Cc: Bommu, Krishnaiah, intel-xe@lists.freedesktop.org,
	Kumar, Janga Rahul, Ghimiray, Himal Prasad, Chehab, Mauro

On Mon, Apr 01, 2024 at 10:10:06AM -0600, Zeng, Oak wrote:
> Hi Matt,
> 
> > -----Original Message-----
> > From: Brost, Matthew <matthew.brost@intel.com>
> > Sent: Sunday, March 31, 2024 5:05 PM
> > To: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>
> > Cc: intel-xe@lists.freedesktop.org; Kumar, Janga Rahul
> > <janga.rahul.kumar@intel.com>; Ghimiray, Himal Prasad
> > <himal.prasad.ghimiray@intel.com>; Zeng, Oak <oak.zeng@intel.com>
> > Subject: Re: [PATCH] lib/xe/xe_util: Creating the helper functions
> > 
> > On Mon, Apr 01, 2024 at 12:29:49AM +0530, Bommu Krishnaiah wrote:
> > > Creating the helper functions which can be reused by many tests,
> > > as part of this patch xe_exec_compute_mode and xe_exec_fault_mode
> > > are used this helper functions.
> > >
> > 
> > As the author of most these tests and IGTs, I can say while not great to
> > have copy paste between tests I much prefer it to adding helpers to try
> > to jam the needs of each test into the helpers. IMO this just creates a
> > headache in the long run. e.g. Let's say I want to change the behavior
> > of xe_exec_compute_mode but not xe_exec_fault_mode, with shared helpers
> > this gets harder.
> 
> Duplicated codes is not a good practice in software. When you have a fix, you need to fix all the places where you have the duplicated codes.
> 

I understand concept of why to not use duplicate codes but see below the
design philosophy of these tests.

> In the example you give, it depends on how we make the helpers. If the helpers are not abstract well enough, we will run into the problem you said. 
> 
> The helper idea is inherited from i915. With those helpers, we can write a test with less than 50 lines of codes. In many cases, just ~20 lines of code, very easy to read and debug. Just take a look of i915 gem_vm_pagefault.c and i915_svm.c.
>

gem_vm_pagefault.c is 2000 plus lines of code for 27 tests.

The test_exec function in xe_exec_compute_mode is ~150 lines of code
which generates 50+ plus tests. There is power to a single data path
controlled by various flags - this leads to the pattern of testing which
is not directed rather do a bunch of things all at the same time. That
is the intent of tests which use this style of coding (xe_exec_basic,
xe_exec_compute_mode, xe_exec_balancer, xe_exec_fault_mode, and
xe_exec_threads). I've used this style of coding to test hardware /
software over the years (started my career writing functional tests for
hardware) and can say these style of tests are by far the most effective
for finding bugs / producing coverage / confidence.

Also they should be easily extentable, see [1] which adds some new
sections to xe_exec_compute_mode to test [2]. New tests are added by
adding a new flag + update data path with new flags. This is my example
why for these types of tests helpers replacing the data-path is not
ideal.

With all of this, I'm not suggesting helpers are not useful nor I am
suggesting directed testing is not useful (see xe_vm, largely a
directed test one of the most essential tests in the suite) just in this
case I do not think the aforementioned tests will benefit from helpers
as written in this patch.

[1] https://patchwork.freedesktop.org/patch/583145/?series=131211&rev=1
[2] https://patchwork.freedesktop.org/series/130935/

> Xe igt is rather complicated compared above i915 tests. In the past few days, I had to debug into xe-exec-threads, it took me long time to really understand the test. The main reason is, we have a huge function such as test_legacy_mode, and flags is used to control the behavior of the test. This is hard to read, maintain and debug.
> 

Saying something is complicated is a matter of opinion. I think the i915
tests are incomprehensible and can easily understand the Xe tests but
course I wrote the Xe tests... I suspect the inverse is true in the i915
tests...

I will say if xe_exec_threads runs to completion the condfience level in
submission working Xe is very, very high. Again ~1500 lines of code
generates ~120 tests.

> > 
> > I'd much rather leave this as is.
> > 
> > Maybe I can buy a helper or two to program the batches, i.e.
> > insert_store LGTM, but certainly not the logic doing bind, invalidates,
> > loop, etc...
> 
> 
> Let me take a look of the helpers, and see whether we can make it better. Our plan is to use those helpers for system allocator test. We use existing tests to test those helpers because right now we don't have system allocator ready. So let's improve it first. We might end up something which is good for both existing test and coming system allocator test. 
> 

Again I don't like in general predicting the future... But I'd almostly
certainly say a test in the style of afformentioned tests will be
written too with me insisting or writing one myself given my experience
doing so.

Off the top of my head...

- various allocation sizes
- mix mmap / malloc / BO allocations
- aligned / unaligned allocations
- different madvise policies
- allocations that faulted in (CPU touches pages before GPU) or not
  faulted in (no backing store on GPU access)
- races between CPU / GPU access
- sparse access to allocations
- dynamic freeing of allocations
- intentionally leak memory
- invalid addresses
- allocate / GPU fault in more memory than VRAM available causing eviction

All of this within a single function controlled by flags.

Extend to multi-threaded versions with dedicated VMs or a shared VMs.

Extend to multi-process version with dedicated VMs and FDs.

So this will probably be 100s of tests with more or less 1 function
called with different arguments. I bet if a test like this run to
completion confidence in SA design is very, very high.

> We will let you decide whether you want to use those helpers for existing tests. If we can't make something good for existing tests, we will target to using those helpers only for the coming system allocator tests.
>

Always open to improvements but my main point here is - helpers
replacing the data-path is a tricky proposition.

Matt
 
> Thanks,
> Oak
> 
> > 
> > Matt
> > 
> > > Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu@intel.com>
> > > Cc: Janga Rahul Kumar <janga.rahul.kumar@intel.com>
> > > Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> > > Cc: Oak Zeng <oak.zeng@intel.com>
> > > ---
> > >  lib/xe/xe_util.c                   | 194 +++++++++++++++++++++++++++++
> > >  lib/xe/xe_util.h                   |  48 +++++++
> > >  tests/intel/xe_exec_compute_mode.c | 190 ++++------------------------
> > >  tests/intel/xe_exec_fault_mode.c   | 179 +++-----------------------
> > >  4 files changed, 282 insertions(+), 329 deletions(-)
> > >
> > > diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c
> > > index 050162b5e..17cd45e45 100644
> > > --- a/lib/xe/xe_util.c
> > > +++ b/lib/xe/xe_util.c
> > > @@ -255,3 +255,197 @@ bool xe_is_gt_in_c6(int fd, int gt)
> > >
> > >  	return strcmp(gt_c_state, "gt-c6") == 0;
> > >  }
> > > +
> > > +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> > > +		uint32_t bo_size, uint64_t placement, unsigned int flags)
> > > +{
> > > +	struct cpu_va *data;
> > > +
> > > +	if (flags & USERPTR) {
> > > +		if (flags & INVALIDATE) {
> > > +			data = mmap((void *)MAP_ADDRESS, bo_size,
> > PROT_READ | PROT_WRITE,
> > > +					MAP_SHARED | MAP_FIXED |
> > MAP_ANONYMOUS, -1, 0);
> > > +			igt_assert(data != MAP_FAILED);
> > > +		} else {
> > > +			data = aligned_alloc(xe_get_default_alignment(fd),
> > bo_size);
> > > +			igt_assert(data);
> > > +		}
> > > +	} else {
> > > +		*bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0, bo_size,
> > placement,
> > > +
> > 	DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > +		data = xe_bo_map(fd, *bo, bo_size);
> > > +	}
> > > +
> > > +	memset(data, 0, bo_size);
> > > +
> > > +	return data;
> > > +}
> > > +
> > > +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> > > +		uint32_t *bind_exec_queues, int n_exec_queues,
> > > +		struct drm_xe_engine_class_instance *eci, unsigned int flags)
> > > +{
> > > +	int i;
> > > +
> > > +	for (i = 0; i < n_exec_queues; i++) {
> > > +		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > +		if (flags & BIND_EXEC_QUEUE)
> > > +			bind_exec_queues[i] = xe_bind_exec_queue_create(fd,
> > vm, 0);
> > > +		else
> > > +			bind_exec_queues[i] = 0;
> > > +	};
> > > +}
> > > +
> > > +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
> > > +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> > > +		struct cpu_va *data, unsigned int flags)
> > > +{
> > > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> > ONE_SEC;
> > > +
> > > +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > +	if (bo)
> > > +		xe_vm_bind_async(fd, vm, bind_exec_queues, bo, 0, addr,
> > bo_size, sync, 1);
> > > +	else
> > > +		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues,
> > > +				to_user_pointer(data), addr,
> > > +				bo_size, sync, 1);
> > > +
> > > +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > +			bind_exec_queues, fence_timeout);
> > > +	data[0].vm_sync = 0;
> > > +
> > > +	if (flags & PREFETCH) {
> > > +		/* Should move to system memory */
> > > +		xe_vm_prefetch_async(fd, vm, bind_exec_queues, 0, addr,
> > > +				bo_size, sync, 1, 0);
> > > +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > +				bind_exec_queues, fence_timeout);
> > > +		data[0].vm_sync = 0;
> > > +	}
> > > +}
> > > +
> > > +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i)
> > > +{
> > > +	int b = 0;
> > > +	data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > > +	data[i].batch[b++] = dst;
> > > +	data[i].batch[b++] = dst >> 32;
> > > +	data[i].batch[b++] = val;
> > > +	data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > > +	igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > > +}
> > > +
> > > +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
> > > +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> > > +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> > > +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> > > +		int *map_fd, unsigned int flags)
> > > +{
> > > +	int i;
> > > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> > ONE_SEC;
> > > +
> > > +	for (i = 0; i < n_execs; i++) {
> > > +		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > > +		uint64_t batch_addr = addr + batch_offset;
> > > +		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > > +		uint64_t sdi_addr = addr + sdi_offset;
> > > +		int e = i % n_exec_queues;
> > > +
> > > +		if (flags & INVALID_VA)
> > > +			sdi_addr = 0x1fffffffffff000;
> > > +
> > > +		insert_store(sdi_addr, data, 0xc0ffee, i);
> > > +
> > > +		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > > +
> > > +		exec->exec_queue_id = exec_queues[e];
> > > +		exec->address = batch_addr;
> > > +		xe_exec(fd, exec);
> > > +
> > > +		if (flags & REBIND && i + 1 != n_execs) {
> > > +			xe_wait_ufence(fd, &data[i].exec_sync,
> > USER_FENCE_VALUE,
> > > +					exec_queues[e], fence_timeout);
> > > +			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > > +					addr, bo_size, NULL, 0);
> > > +
> > > +			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > +			addr += bo_size;
> > > +
> > > +			if (bo)
> > > +				xe_vm_bind_async(fd, vm,
> > bind_exec_queues[e], bo,
> > > +						0, addr, bo_size, sync, 1);
> > > +			else
> > > +				xe_vm_bind_userptr_async(fd, vm,
> > > +						bind_exec_queues[e],
> > > +						to_user_pointer(data),
> > > +						addr, bo_size, sync, 1);
> > > +
> > > +			xe_wait_ufence(fd, &data[0].vm_sync,
> > USER_FENCE_VALUE,
> > > +					bind_exec_queues[e], fence_timeout);
> > > +			data[0].vm_sync = 0;
> > > +		}
> > > +
> > > +		if (flags & INVALIDATE && i + 1 != n_execs) {
> > > +			if (!(flags & RACE)) {
> > > +				/*
> > > +				 * Wait for exec completion and check data as
> > > +				 * userptr will likely change to different
> > > +				 * physical memory on next mmap call triggering
> > > +				 * an invalidate.
> > > +				 */
> > > +				xe_wait_ufence(fd, &data[i].exec_sync,
> > > +						USER_FENCE_VALUE,
> > exec_queues[e],
> > > +						fence_timeout);
> > > +				igt_assert_eq(data[i].data, 0xc0ffee);
> > > +			} else if (i * 2 != n_execs) {
> > > +				/*
> > > +				 * We issue 1 mmap which races against running
> > > +				 * jobs. No real check here aside from this test
> > > +				 * not faulting on the GPU.
> > > +				 */
> > > +				continue;
> > > +			}
> > > +
> > > +			if (flags & RACE) {
> > > +				*map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > > +						0x666);
> > > +				write(*map_fd, data, bo_size);
> > > +				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > +						PROT_READ | PROT_WRITE,
> > MAP_SHARED |
> > > +						MAP_FIXED, *map_fd, 0);
> > > +			} else {
> > > +				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > +						PROT_READ | PROT_WRITE,
> > MAP_SHARED |
> > > +						MAP_FIXED |
> > MAP_ANONYMOUS, -1, 0);
> > > +			}
> > > +
> > > +			igt_assert(data != MAP_FAILED);
> > > +		}
> > > +	}
> > > +}
> > > +
> > > +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> > > +		struct drm_xe_sync *sync, struct cpu_va *data, uint64_t addr,
> > > +		uint32_t bo_size)
> > > +{
> > > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> > ONE_SEC;
> > > +
> > > +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > +	xe_vm_unbind_async(fd, vm, bind_exec_queues, 0, addr, bo_size,
> > > +			sync, 1);
> > > +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > +			bind_exec_queues, fence_timeout);
> > > +}
> > > +
> > > +void destory_exec_queue(int fd, uint32_t *exec_queues,
> > > +		uint32_t *bind_exec_queues, int n_exec_queues)
> > > +{
> > > +	int i;
> > > +
> > > +	for (i = 0; i < n_exec_queues; i++) {
> > > +		xe_exec_queue_destroy(fd, exec_queues[i]);
> > > +		if (bind_exec_queues[i])
> > > +			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > > +	}
> > > +}
> > > +
> > > diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h
> > > index 6480ea01a..aa1b0dcc1 100644
> > > --- a/lib/xe/xe_util.h
> > > +++ b/lib/xe/xe_util.h
> > > @@ -47,4 +47,52 @@ void xe_bind_unbind_async(int fd, uint32_t vm, uint32_t
> > bind_engine,
> > >
> > >  bool xe_is_gt_in_c6(int fd, int gt);
> > >
> > > +
> > > +#define USERPTR			(0x1 << 0)
> > > +#define REBIND			(0x1 << 1)
> > > +#define INVALIDATE		(0x1 << 2)
> > > +#define RACE			(0x1 << 3)
> > > +#define BIND_EXEC_QUEUE		(0x1 << 4)
> > > +#define PREFETCH		(0x1 << 5)
> > > +#define INVALID_FAULT		(0x1 << 6)
> > > +#define INVALID_VA		(0x1 << 7)
> > > +#define ENABLE_SCRATCH		(0x1 << 8)
> > > +#define VM_FOR_BO		(0x1 << 9)
> > > +#define EXEC_QUEUE_EARLY	(0x1 << 10)
> > > +
> > > +#define MAX_N_EXEC_QUEUES	16
> > > +#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > > +#define MAP_ADDRESS		0x00007fadeadbe000
> > > +
> > > +#define ONE_SEC		MS_TO_NS(1000)
> > > +#define HUNDRED_SEC	MS_TO_NS(100000)
> > > +
> > > +struct cpu_va {
> > > +	uint32_t batch[16];
> > > +	uint64_t pad;
> > > +	uint64_t vm_sync;
> > > +	uint64_t exec_sync;
> > > +	uint32_t data;
> > > +};
> > > +
> > > +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> > > +		uint32_t bo_size, uint64_t placement,  unsigned int flags);
> > > +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> > > +		uint32_t *bind_exec_queues, int n_exec_queues,
> > > +		struct drm_xe_engine_class_instance *eci, unsigned int flags);
> > > +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues, uint32_t bo,
> > > +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> > > +		struct cpu_va *data, unsigned int flags);
> > > +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int n_execs,
> > > +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> > > +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> > > +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> > > +		int *map_fd, unsigned int flags);
> > > +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i);
> > > +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> > > +		struct drm_xe_sync *sync, struct cpu_va *data,
> > > +		uint64_t addr, uint32_t bo_size);
> > > +void destory_exec_queue(int fd, uint32_t *exec_queues,
> > > +		uint32_t *bind_exec_queues, int n_exec_queues);
> > > +
> > >  #endif /* XE_UTIL_H */
> > > diff --git a/tests/intel/xe_exec_compute_mode.c
> > b/tests/intel/xe_exec_compute_mode.c
> > > index 3ec848164..e8d82cc69 100644
> > > --- a/tests/intel/xe_exec_compute_mode.c
> > > +++ b/tests/intel/xe_exec_compute_mode.c
> > > @@ -15,6 +15,7 @@
> > >  #include "igt.h"
> > >  #include "lib/igt_syncobj.h"
> > >  #include "lib/intel_reg.h"
> > > +#include "lib/xe/xe_util.h"
> > >  #include <sys/ioctl.h>
> > >  #include "xe_drm.h"
> > >
> > > @@ -23,15 +24,6 @@
> > >  #include "xe/xe_spin.h"
> > >  #include <string.h>
> > >
> > > -#define MAX_N_EXECQUEUES 	16
> > > -#define USERPTR				(0x1 << 0)
> > > -#define REBIND				(0x1 << 1)
> > > -#define INVALIDATE			(0x1 << 2)
> > > -#define RACE				(0x1 << 3)
> > > -#define BIND_EXECQUEUE		(0x1 << 4)
> > > -#define VM_FOR_BO			(0x1 << 5)
> > > -#define EXEC_QUEUE_EARLY	(0x1 << 6)
> > > -
> > >  /**
> > >   * SUBTEST: twice-%s
> > >   * Description: Run %arg[1] compute machine test twice
> > > @@ -88,7 +80,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> > *eci,
> > >  {
> > >  	uint32_t vm;
> > >  	uint64_t addr = 0x1a0000;
> > > -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > >  	struct drm_xe_sync sync[1] = {
> > >  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
> > >  		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> > > @@ -99,161 +90,38 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> > *eci,
> > >  		.num_syncs = 1,
> > >  		.syncs = to_user_pointer(sync),
> > >  	};
> > > -	uint32_t exec_queues[MAX_N_EXECQUEUES];
> > > -	uint32_t bind_exec_queues[MAX_N_EXECQUEUES];
> > > +	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> > > +	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
> > >  	size_t bo_size;
> > >  	uint32_t bo = 0;
> > > -	struct {
> > > -		uint32_t batch[16];
> > > -		uint64_t pad;
> > > -		uint64_t vm_sync;
> > > -		uint64_t exec_sync;
> > > -		uint32_t data;
> > > -	} *data;
> > > -	int i, j, b;
> > > +	struct cpu_va *data;
> > > +	int i, j;
> > >  	int map_fd = -1;
> > >  	int64_t fence_timeout;
> > > +	uint64_t placement;
> > >
> > > -	igt_assert(n_exec_queues <= MAX_N_EXECQUEUES);
> > > +	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> > >
> > >  	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
> > >  	bo_size = sizeof(*data) * n_execs;
> > >  	bo_size = xe_bb_size(fd, bo_size);
> > >
> > > -	for (i = 0; (flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> > > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > -		if (flags & BIND_EXECQUEUE)
> > > -			bind_exec_queues[i] =
> > > -				xe_bind_exec_queue_create(fd, vm, 0);
> > > -		else
> > > -			bind_exec_queues[i] = 0;
> > > -	};
> > > +	if (flags & EXEC_QUEUE_EARLY)
> > > +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> > n_exec_queues, eci, flags);
> > >
> > > -	if (flags & USERPTR) {
> > > -#define	MAP_ADDRESS	0x00007fadeadbe000
> > > -		if (flags & INVALIDATE) {
> > > -			data = mmap((void *)MAP_ADDRESS, bo_size,
> > PROT_READ |
> > > -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> > > -				    MAP_ANONYMOUS, -1, 0);
> > > -			igt_assert(data != MAP_FAILED);
> > > -		} else {
> > > -			data = aligned_alloc(xe_get_default_alignment(fd),
> > > -					     bo_size);
> > > -			igt_assert(data);
> > > -		}
> > > -	} else {
> > > -		bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0,
> > > -				  bo_size, vram_if_possible(fd, eci->gt_id),
> > > -
> > DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > -		data = xe_bo_map(fd, bo, bo_size);
> > > -	}
> > > -	memset(data, 0, bo_size);
> > > +	placement = vram_if_possible(fd, eci->gt_id);
> > >
> > > -	for (i = 0; !(flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> > > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > -		if (flags & BIND_EXECQUEUE)
> > > -			bind_exec_queues[i] =
> > > -				xe_bind_exec_queue_create(fd, vm, 0);
> > > -		else
> > > -			bind_exec_queues[i] = 0;
> > > -	};
> > > +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> > >
> > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > -	if (bo)
> > > -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> > > -				 bo_size, sync, 1);
> > > -	else
> > > -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> > > -					 to_user_pointer(data), addr,
> > > -					 bo_size, sync, 1);
> > > -#define ONE_SEC	MS_TO_NS(1000)
> > > -#define HUNDRED_SEC	MS_TO_NS(100000)
> > > +	if(!(flags & EXEC_QUEUE_EARLY))
> > > +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> > n_exec_queues, eci, flags);
> > > +
> > > +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> > flags);
> > >
> > >  	fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
> > >
> > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > -		       bind_exec_queues[0], fence_timeout);
> > > -	data[0].vm_sync = 0;
> > > -
> > > -	for (i = 0; i < n_execs; i++) {
> > > -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > > -		uint64_t batch_addr = addr + batch_offset;
> > > -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > > -		uint64_t sdi_addr = addr + sdi_offset;
> > > -		int e = i % n_exec_queues;
> > > -
> > > -		b = 0;
> > > -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > > -		data[i].batch[b++] = sdi_addr;
> > > -		data[i].batch[b++] = sdi_addr >> 32;
> > > -		data[i].batch[b++] = 0xc0ffee;
> > > -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > > -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > > -
> > > -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > > -
> > > -		exec.exec_queue_id = exec_queues[e];
> > > -		exec.address = batch_addr;
> > > -		xe_exec(fd, &exec);
> > > -
> > > -		if (flags & REBIND && i + 1 != n_execs) {
> > > -			xe_wait_ufence(fd, &data[i].exec_sync,
> > USER_FENCE_VALUE,
> > > -				       exec_queues[e], fence_timeout);
> > > -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > > -					   addr, bo_size, NULL, 0);
> > > -
> > > -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > -			addr += bo_size;
> > > -			if (bo)
> > > -				xe_vm_bind_async(fd, vm,
> > bind_exec_queues[e], bo,
> > > -						 0, addr, bo_size, sync, 1);
> > > -			else
> > > -				xe_vm_bind_userptr_async(fd, vm,
> > > -							 bind_exec_queues[e],
> > > -							 to_user_pointer(data),
> > > -							 addr, bo_size, sync,
> > > -							 1);
> > > -			xe_wait_ufence(fd, &data[0].vm_sync,
> > USER_FENCE_VALUE,
> > > -				       bind_exec_queues[e], fence_timeout);
> > > -			data[0].vm_sync = 0;
> > > -		}
> > > -
> > > -		if (flags & INVALIDATE && i + 1 != n_execs) {
> > > -			if (!(flags & RACE)) {
> > > -				/*
> > > -				 * Wait for exec completion and check data as
> > > -				 * userptr will likely change to different
> > > -				 * physical memory on next mmap call triggering
> > > -				 * an invalidate.
> > > -				 */
> > > -				xe_wait_ufence(fd, &data[i].exec_sync,
> > > -					       USER_FENCE_VALUE,
> > exec_queues[e],
> > > -					       fence_timeout);
> > > -				igt_assert_eq(data[i].data, 0xc0ffee);
> > > -			} else if (i * 2 != n_execs) {
> > > -				/*
> > > -				 * We issue 1 mmap which races against running
> > > -				 * jobs. No real check here aside from this test
> > > -				 * not faulting on the GPU.
> > > -				 */
> > > -				continue;
> > > -			}
> > > -
> > > -			if (flags & RACE) {
> > > -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > > -					      0x666);
> > > -				write(map_fd, data, bo_size);
> > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > -					    PROT_READ | PROT_WRITE,
> > MAP_SHARED |
> > > -					    MAP_FIXED, map_fd, 0);
> > > -			} else {
> > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > -					    PROT_READ | PROT_WRITE,
> > MAP_SHARED |
> > > -					    MAP_FIXED | MAP_ANONYMOUS, -1,
> > 0);
> > > -			}
> > > -			igt_assert(data != MAP_FAILED);
> > > -		}
> > > -	}
> > > +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> > > +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd,
> > flags);
> > >
> > >  	j = flags & INVALIDATE ? n_execs - 1 : 0;
> > >  	for (i = j; i < n_execs; i++)
> > > @@ -264,20 +132,12 @@ test_exec(int fd, struct
> > drm_xe_engine_class_instance *eci,
> > >  	if (flags & INVALIDATE)
> > >  		usleep(250000);
> > >
> > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> > > -			   sync, 1);
> > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > -		       bind_exec_queues[0], fence_timeout);
> > > +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> > >
> > >  	for (i = j; i < n_execs; i++)
> > >  		igt_assert_eq(data[i].data, 0xc0ffee);
> > >
> > > -	for (i = 0; i < n_exec_queues; i++) {
> > > -		xe_exec_queue_destroy(fd, exec_queues[i]);
> > > -		if (bind_exec_queues[i])
> > > -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > > -	}
> > > +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> > n_exec_queues);
> > >
> > >  	if (bo) {
> > >  		munmap(data, bo_size);
> > > @@ -492,14 +352,14 @@ igt_main
> > >  		{ "userptr-rebind", USERPTR | REBIND },
> > >  		{ "userptr-invalidate", USERPTR | INVALIDATE },
> > >  		{ "userptr-invalidate-race", USERPTR | INVALIDATE | RACE },
> > > -		{ "bindexecqueue", BIND_EXECQUEUE },
> > > -		{ "bindexecqueue-userptr", BIND_EXECQUEUE | USERPTR },
> > > -		{ "bindexecqueue-rebind",  BIND_EXECQUEUE | REBIND },
> > > -		{ "bindexecqueue-userptr-rebind",  BIND_EXECQUEUE |
> > USERPTR |
> > > +		{ "bindexecqueue", BIND_EXEC_QUEUE },
> > > +		{ "bindexecqueue-userptr", BIND_EXEC_QUEUE | USERPTR },
> > > +		{ "bindexecqueue-rebind",  BIND_EXEC_QUEUE | REBIND },
> > > +		{ "bindexecqueue-userptr-rebind",  BIND_EXEC_QUEUE |
> > USERPTR |
> > >  			REBIND },
> > > -		{ "bindexecqueue-userptr-invalidate",  BIND_EXECQUEUE |
> > USERPTR |
> > > +		{ "bindexecqueue-userptr-invalidate",  BIND_EXEC_QUEUE |
> > USERPTR |
> > >  			INVALIDATE },
> > > -		{ "bindexecqueue-userptr-invalidate-race", BIND_EXECQUEUE |
> > USERPTR |
> > > +		{ "bindexecqueue-userptr-invalidate-race", BIND_EXEC_QUEUE |
> > USERPTR |
> > >  			INVALIDATE | RACE },
> > >  		{ NULL },
> > >  	};
> > > diff --git a/tests/intel/xe_exec_fault_mode.c
> > b/tests/intel/xe_exec_fault_mode.c
> > > index 40fe1743e..7b2fef224 100644
> > > --- a/tests/intel/xe_exec_fault_mode.c
> > > +++ b/tests/intel/xe_exec_fault_mode.c
> > > @@ -16,24 +16,13 @@
> > >  #include "igt.h"
> > >  #include "lib/igt_syncobj.h"
> > >  #include "lib/intel_reg.h"
> > > +#include "lib/xe/xe_util.h"
> > >  #include "xe_drm.h"
> > >
> > >  #include "xe/xe_ioctl.h"
> > >  #include "xe/xe_query.h"
> > >  #include <string.h>
> > >
> > > -#define MAX_N_EXEC_QUEUES	16
> > > -
> > > -#define USERPTR		(0x1 << 0)
> > > -#define REBIND		(0x1 << 1)
> > > -#define INVALIDATE	(0x1 << 2)
> > > -#define RACE		(0x1 << 3)
> > > -#define BIND_EXEC_QUEUE	(0x1 << 4)
> > > -#define PREFETCH	(0x1 << 5)
> > > -#define INVALID_FAULT	(0x1 << 6)
> > > -#define INVALID_VA	(0x1 << 7)
> > > -#define ENABLE_SCRATCH  (0x1 << 8)
> > > -
> > >  /**
> > >   * SUBTEST: invalid-va
> > >   * Description: Access invalid va and check for EIO through user fence.
> > > @@ -99,7 +88,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> > *eci,
> > >  {
> > >  	uint32_t vm;
> > >  	uint64_t addr = 0x1a0000;
> > > -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > >  	struct drm_xe_sync sync[1] = {
> > >  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags =
> > DRM_XE_SYNC_FLAG_SIGNAL,
> > >  	          .timeline_value = USER_FENCE_VALUE },
> > > @@ -113,14 +101,9 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> > *eci,
> > >  	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
> > >  	size_t bo_size;
> > >  	uint32_t bo = 0;
> > > -	struct {
> > > -		uint32_t batch[16];
> > > -		uint64_t pad;
> > > -		uint64_t vm_sync;
> > > -		uint64_t exec_sync;
> > > -		uint32_t data;
> > > -	} *data;
> > > -	int i, j, b;
> > > +	struct cpu_va *data;
> > > +	uint64_t placement;
> > > +	int i, j;
> > >  	int map_fd = -1;
> > >
> > >  	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> > > @@ -134,144 +117,19 @@ test_exec(int fd, struct
> > drm_xe_engine_class_instance *eci,
> > >  	bo_size = sizeof(*data) * n_execs;
> > >  	bo_size = xe_bb_size(fd, bo_size);
> > >
> > > -	if (flags & USERPTR) {
> > > -#define	MAP_ADDRESS	0x00007fadeadbe000
> > > -		if (flags & INVALIDATE) {
> > > -			data = mmap((void *)MAP_ADDRESS, bo_size,
> > PROT_READ |
> > > -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> > > -				    MAP_ANONYMOUS, -1, 0);
> > > -			igt_assert(data != MAP_FAILED);
> > > -		} else {
> > > -			data = aligned_alloc(xe_get_default_alignment(fd),
> > > -					     bo_size);
> > > -			igt_assert(data);
> > > -		}
> > > -	} else {
> > > -		if (flags & PREFETCH)
> > > -			bo = xe_bo_create(fd, 0, bo_size,
> > > -					  all_memory_regions(fd) |
> > > -					  vram_if_possible(fd, 0),
> > > -
> > DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > -		else
> > > -			bo = xe_bo_create(fd, 0, bo_size,
> > > -					  vram_if_possible(fd, eci->gt_id),
> > > -
> > DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > -		data = xe_bo_map(fd, bo, bo_size);
> > > -	}
> > > -	memset(data, 0, bo_size);
> > > -
> > > -	for (i = 0; i < n_exec_queues; i++) {
> > > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > -		if (flags & BIND_EXEC_QUEUE)
> > > -			bind_exec_queues[i] =
> > > -				xe_bind_exec_queue_create(fd, vm, 0);
> > > -		else
> > > -			bind_exec_queues[i] = 0;
> > > -	};
> > > -
> > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > -	if (bo)
> > > -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> > bo_size, sync, 1);
> > > -	else
> > > -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> > > -					 to_user_pointer(data), addr,
> > > -					 bo_size, sync, 1);
> > > -#define ONE_SEC	MS_TO_NS(1000)
> > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > -		       bind_exec_queues[0], ONE_SEC);
> > > -	data[0].vm_sync = 0;
> > > -
> > > -	if (flags & PREFETCH) {
> > > -		/* Should move to system memory */
> > > -		xe_vm_prefetch_async(fd, vm, bind_exec_queues[0], 0, addr,
> > > -				     bo_size, sync, 1, 0);
> > > -		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > -			       bind_exec_queues[0], ONE_SEC);
> > > -		data[0].vm_sync = 0;
> > > -	}
> > > -
> > > -	for (i = 0; i < n_execs; i++) {
> > > -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > > -		uint64_t batch_addr = addr + batch_offset;
> > > -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > > -		uint64_t sdi_addr = addr + sdi_offset;
> > > -		int e = i % n_exec_queues;
> > > +	placement = (flags & PREFETCH) ? all_memory_regions(fd) |
> > > +		vram_if_possible(fd, 0) : vram_if_possible(fd, eci->gt_id);
> > >
> > > -		b = 0;
> > > -		if (flags & INVALID_VA)
> > > -			sdi_addr = 0x1fffffffffff000;
> > > +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> > >
> > > -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > > -		data[i].batch[b++] = sdi_addr;
> > > -		data[i].batch[b++] = sdi_addr >> 32;
> > > -		data[i].batch[b++] = 0xc0ffee;
> > > -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > > -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > > +	create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> > > +			n_exec_queues, eci, flags);
> > >
> > > -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > > +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> > flags);
> > >
> > > -		exec.exec_queue_id = exec_queues[e];
> > > -		exec.address = batch_addr;
> > > -		xe_exec(fd, &exec);
> > > +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> > > +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd,
> > flags);
> > >
> > > -		if (flags & REBIND && i + 1 != n_execs) {
> > > -			xe_wait_ufence(fd, &data[i].exec_sync,
> > USER_FENCE_VALUE,
> > > -				       exec_queues[e], ONE_SEC);
> > > -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > > -					   addr, bo_size, NULL, 0);
> > > -
> > > -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > -			addr += bo_size;
> > > -			if (bo)
> > > -				xe_vm_bind_async(fd, vm,
> > bind_exec_queues[e], bo,
> > > -						 0, addr, bo_size, sync, 1);
> > > -			else
> > > -				xe_vm_bind_userptr_async(fd, vm,
> > > -							 bind_exec_queues[e],
> > > -							 to_user_pointer(data),
> > > -							 addr, bo_size, sync,
> > > -							 1);
> > > -			xe_wait_ufence(fd, &data[0].vm_sync,
> > USER_FENCE_VALUE,
> > > -				       bind_exec_queues[e], ONE_SEC);
> > > -			data[0].vm_sync = 0;
> > > -		}
> > > -
> > > -		if (flags & INVALIDATE && i + 1 != n_execs) {
> > > -			if (!(flags & RACE)) {
> > > -				/*
> > > -				 * Wait for exec completion and check data as
> > > -				 * userptr will likely change to different
> > > -				 * physical memory on next mmap call triggering
> > > -				 * an invalidate.
> > > -				 */
> > > -				xe_wait_ufence(fd, &data[i].exec_sync,
> > > -					       USER_FENCE_VALUE,
> > exec_queues[e],
> > > -					       ONE_SEC);
> > > -				igt_assert_eq(data[i].data, 0xc0ffee);
> > > -			} else if (i * 2 != n_execs) {
> > > -				/*
> > > -				 * We issue 1 mmap which races against running
> > > -				 * jobs. No real check here aside from this test
> > > -				 * not faulting on the GPU.
> > > -				 */
> > > -				continue;
> > > -			}
> > > -
> > > -			if (flags & RACE) {
> > > -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > > -					      0x666);
> > > -				write(map_fd, data, bo_size);
> > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > -					    PROT_READ | PROT_WRITE,
> > MAP_SHARED |
> > > -					    MAP_FIXED, map_fd, 0);
> > > -			} else {
> > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > -					    PROT_READ | PROT_WRITE,
> > MAP_SHARED |
> > > -					    MAP_FIXED | MAP_ANONYMOUS, -1,
> > 0);
> > > -			}
> > > -			igt_assert(data != MAP_FAILED);
> > > -		}
> > > -	}
> > >  	if (!(flags & INVALID_FAULT)) {
> > >  		int64_t timeout = ONE_SEC;
> > >
> > > @@ -286,22 +144,15 @@ test_exec(int fd, struct
> > drm_xe_engine_class_instance *eci,
> > >  							       exec_queues[i %
> > n_exec_queues], &timeout), 0);
> > >  		}
> > >  	}
> > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> > > -			   sync, 1);
> > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > -		       bind_exec_queues[0], ONE_SEC);
> > > +
> > > +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> > >
> > >  	if (!(flags & INVALID_FAULT) && !(flags & INVALID_VA)) {
> > >  		for (i = j; i < n_execs; i++)
> > >  				igt_assert_eq(data[i].data, 0xc0ffee);
> > >  	}
> > >
> > > -	for (i = 0; i < n_exec_queues; i++) {
> > > -		xe_exec_queue_destroy(fd, exec_queues[i]);
> > > -		if (bind_exec_queues[i])
> > > -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > > -	}
> > > +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> > n_exec_queues);
> > >
> > >  	if (bo) {
> > >  		munmap(data, bo_size);
> > > --
> > > 2.25.1
> > >

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] lib/xe/xe_util: Creating the helper functions
  2024-04-01 18:40     ` Matthew Brost
@ 2024-04-01 20:28       ` Zeng, Oak
  2024-04-01 22:32         ` Matthew Brost
  0 siblings, 1 reply; 9+ messages in thread
From: Zeng, Oak @ 2024-04-01 20:28 UTC (permalink / raw)
  To: Brost, Matthew
  Cc: Bommu, Krishnaiah, intel-xe@lists.freedesktop.org,
	Kumar, Janga Rahul, Ghimiray, Himal Prasad, Chehab, Mauro

Hi Matt

> -----Original Message-----
> From: Brost, Matthew <matthew.brost@intel.com>
> Sent: Monday, April 1, 2024 2:40 PM
> To: Zeng, Oak <oak.zeng@intel.com>
> Cc: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; intel-
> xe@lists.freedesktop.org; Kumar, Janga Rahul <janga.rahul.kumar@intel.com>;
> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Chehab, Mauro
> <mauro.chehab@intel.com>
> Subject: Re: [PATCH] lib/xe/xe_util: Creating the helper functions
> 
> On Mon, Apr 01, 2024 at 10:10:06AM -0600, Zeng, Oak wrote:
> > Hi Matt,
> >
> > > -----Original Message-----
> > > From: Brost, Matthew <matthew.brost@intel.com>
> > > Sent: Sunday, March 31, 2024 5:05 PM
> > > To: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>
> > > Cc: intel-xe@lists.freedesktop.org; Kumar, Janga Rahul
> > > <janga.rahul.kumar@intel.com>; Ghimiray, Himal Prasad
> > > <himal.prasad.ghimiray@intel.com>; Zeng, Oak <oak.zeng@intel.com>
> > > Subject: Re: [PATCH] lib/xe/xe_util: Creating the helper functions
> > >
> > > On Mon, Apr 01, 2024 at 12:29:49AM +0530, Bommu Krishnaiah wrote:
> > > > Creating the helper functions which can be reused by many tests,
> > > > as part of this patch xe_exec_compute_mode and xe_exec_fault_mode
> > > > are used this helper functions.
> > > >
> > >
> > > As the author of most these tests and IGTs, I can say while not great to
> > > have copy paste between tests I much prefer it to adding helpers to try
> > > to jam the needs of each test into the helpers. IMO this just creates a
> > > headache in the long run. e.g. Let's say I want to change the behavior
> > > of xe_exec_compute_mode but not xe_exec_fault_mode, with shared
> helpers
> > > this gets harder.
> >
> > Duplicated codes is not a good practice in software. When you have a fix, you
> need to fix all the places where you have the duplicated codes.
> >
> 
> I understand concept of why to not use duplicate codes but see below the
> design philosophy of these tests.
> 
> > In the example you give, it depends on how we make the helpers. If the
> helpers are not abstract well enough, we will run into the problem you said.
> >
> > The helper idea is inherited from i915. With those helpers, we can write a test
> with less than 50 lines of codes. In many cases, just ~20 lines of code, very easy to
> read and debug. Just take a look of i915 gem_vm_pagefault.c and i915_svm.c.
> >
> 
> gem_vm_pagefault.c is 2000 plus lines of code for 27 tests.
> 
> The test_exec function in xe_exec_compute_mode is ~150 lines of code
> which generates 50+ plus tests. There is power to a single data path
> controlled by various flags - this leads to the pattern of testing which
> is not directed rather do a bunch of things all at the same time. That
> is the intent of tests which use this style of coding (xe_exec_basic,
> xe_exec_compute_mode, xe_exec_balancer, xe_exec_fault_mode, and
> xe_exec_threads). I've used this style of coding to test hardware /
> software over the years (started my career writing functional tests for
> hardware) and can say these style of tests are by far the most effective
> for finding bugs / producing coverage / confidence.


Yes, I was also amazed by the efficiency of the xe style test. But that doesn't prohibit we introduce a few helpers.

As said in the other thread of this code review, I agree we can continue this style for the existing test. I also asked Krishna to remove some big helpers. The idea is, we will introduce a few simple helpers (mainly for the system allocator tests which will take another style: each test is simple, but overall has less test/LOC efficiency). 

We will apply those helpers for the existing tests but won't change your coding style. 

> 
> Also they should be easily extentable, see [1] which adds some new
> sections to xe_exec_compute_mode to test [2]. New tests are added by
> adding a new flag + update data path with new flags. This is my example
> why for these types of tests helpers replacing the data-path is not
> ideal.
> 
> With all of this, I'm not suggesting helpers are not useful nor I am
> suggesting directed testing is not useful (see xe_vm, largely a
> directed test one of the most essential tests in the suite) just in this
> case I do not think the aforementioned tests will benefit from helpers
> as written in this patch.

Sure. Let's Krishna rework on the helpers and let's see what we get. As said, our goal is not to simplify the existing test, nor to change the coding style of existing test. 

Whether we can apply helpers to existing tests? It all depends on how we abstract the helpers well enough. I believe helpers such as create and map buffer, create and fill command buffer, submit command buffer etc can simplify things a little. But lets wait for Krishna's improvement and see.



> 
> [1] https://patchwork.freedesktop.org/patch/583145/?series=131211&rev=1
> [2] https://patchwork.freedesktop.org/series/130935/
> 
> > Xe igt is rather complicated compared above i915 tests. In the past few days, I
> had to debug into xe-exec-threads, it took me long time to really understand the
> test. The main reason is, we have a huge function such as test_legacy_mode, and
> flags is used to control the behavior of the test. This is hard to read, maintain and
> debug.
> >
> 
> Saying something is complicated is a matter of opinion. I think the i915
> tests are incomprehensible and can easily understand the Xe tests but
> course I wrote the Xe tests... I suspect the inverse is true in the i915
> tests...
> 
> I will say if xe_exec_threads runs to completion the condfience level in
> submission working Xe is very, very high. Again ~1500 lines of code
> generates ~120 tests.

Agreed. Most of the i915 tests are also the same style as you coded for xe. Just those two I mentioned above used another style. I have no doubt of the efficiency of that style. As said, I don't have any intension to change this style.

> 
> > >
> > > I'd much rather leave this as is.
> > >
> > > Maybe I can buy a helper or two to program the batches, i.e.
> > > insert_store LGTM, but certainly not the logic doing bind, invalidates,
> > > loop, etc...
> >
> >
> > Let me take a look of the helpers, and see whether we can make it better. Our
> plan is to use those helpers for system allocator test. We use existing tests to test
> those helpers because right now we don't have system allocator ready. So let's
> improve it first. We might end up something which is good for both existing test
> and coming system allocator test.
> >
> 
> Again I don't like in general predicting the future... But I'd almostly
> certainly say a test in the style of afformentioned tests will be
> written too with me insisting or writing one myself given my experience
> doing so.
> 
> Off the top of my head...
> 
> - various allocation sizes
> - mix mmap / malloc / BO allocations
> - aligned / unaligned allocations
> - different madvise policies
> - allocations that faulted in (CPU touches pages before GPU) or not
>   faulted in (no backing store on GPU access)
> - races between CPU / GPU access
> - sparse access to allocations
> - dynamic freeing of allocations
> - intentionally leak memory
> - invalid addresses
> - allocate / GPU fault in more memory than VRAM available causing eviction
> 
> All of this within a single function controlled by flags.

In the coding style I preferred, each of above lines will be one function, vs single function controlled by flags. I am pretty sure my style will be more readable but at the cost of more LOC.

Again, I think both styles works. Both have its pros and cons. I don't have any intension to change the coding style of existing tests.


> 
> Extend to multi-threaded versions with dedicated VMs or a shared VMs.
> 
> Extend to multi-process version with dedicated VMs and FDs.
> 
> So this will probably be 100s of tests with more or less 1 function
> called with different arguments. I bet if a test like this run to
> completion confidence in SA design is very, very high.
> 
> > We will let you decide whether you want to use those helpers for existing tests.
> If we can't make something good for existing tests, we will target to using those
> helpers only for the coming system allocator tests.
> >
> 
> Always open to improvements but my main point here is - helpers
> replacing the data-path is a tricky proposition.

Again, we will not change the data path.

Oak


> 
> Matt
> 
> > Thanks,
> > Oak
> >
> > >
> > > Matt
> > >
> > > > Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu@intel.com>
> > > > Cc: Janga Rahul Kumar <janga.rahul.kumar@intel.com>
> > > > Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> > > > Cc: Oak Zeng <oak.zeng@intel.com>
> > > > ---
> > > >  lib/xe/xe_util.c                   | 194 +++++++++++++++++++++++++++++
> > > >  lib/xe/xe_util.h                   |  48 +++++++
> > > >  tests/intel/xe_exec_compute_mode.c | 190 ++++------------------------
> > > >  tests/intel/xe_exec_fault_mode.c   | 179 +++-----------------------
> > > >  4 files changed, 282 insertions(+), 329 deletions(-)
> > > >
> > > > diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c
> > > > index 050162b5e..17cd45e45 100644
> > > > --- a/lib/xe/xe_util.c
> > > > +++ b/lib/xe/xe_util.c
> > > > @@ -255,3 +255,197 @@ bool xe_is_gt_in_c6(int fd, int gt)
> > > >
> > > >  	return strcmp(gt_c_state, "gt-c6") == 0;
> > > >  }
> > > > +
> > > > +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> > > > +		uint32_t bo_size, uint64_t placement, unsigned int flags)
> > > > +{
> > > > +	struct cpu_va *data;
> > > > +
> > > > +	if (flags & USERPTR) {
> > > > +		if (flags & INVALIDATE) {
> > > > +			data = mmap((void *)MAP_ADDRESS, bo_size,
> > > PROT_READ | PROT_WRITE,
> > > > +					MAP_SHARED | MAP_FIXED |
> > > MAP_ANONYMOUS, -1, 0);
> > > > +			igt_assert(data != MAP_FAILED);
> > > > +		} else {
> > > > +			data = aligned_alloc(xe_get_default_alignment(fd),
> > > bo_size);
> > > > +			igt_assert(data);
> > > > +		}
> > > > +	} else {
> > > > +		*bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0, bo_size,
> > > placement,
> > > > +
> > > 	DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > > +		data = xe_bo_map(fd, *bo, bo_size);
> > > > +	}
> > > > +
> > > > +	memset(data, 0, bo_size);
> > > > +
> > > > +	return data;
> > > > +}
> > > > +
> > > > +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> > > > +		uint32_t *bind_exec_queues, int n_exec_queues,
> > > > +		struct drm_xe_engine_class_instance *eci, unsigned int flags)
> > > > +{
> > > > +	int i;
> > > > +
> > > > +	for (i = 0; i < n_exec_queues; i++) {
> > > > +		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > > +		if (flags & BIND_EXEC_QUEUE)
> > > > +			bind_exec_queues[i] = xe_bind_exec_queue_create(fd,
> > > vm, 0);
> > > > +		else
> > > > +			bind_exec_queues[i] = 0;
> > > > +	};
> > > > +}
> > > > +
> > > > +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> uint32_t bo,
> > > > +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> > > > +		struct cpu_va *data, unsigned int flags)
> > > > +{
> > > > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> > > ONE_SEC;
> > > > +
> > > > +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > +	if (bo)
> > > > +		xe_vm_bind_async(fd, vm, bind_exec_queues, bo, 0, addr,
> > > bo_size, sync, 1);
> > > > +	else
> > > > +		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues,
> > > > +				to_user_pointer(data), addr,
> > > > +				bo_size, sync, 1);
> > > > +
> > > > +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > +			bind_exec_queues, fence_timeout);
> > > > +	data[0].vm_sync = 0;
> > > > +
> > > > +	if (flags & PREFETCH) {
> > > > +		/* Should move to system memory */
> > > > +		xe_vm_prefetch_async(fd, vm, bind_exec_queues, 0, addr,
> > > > +				bo_size, sync, 1, 0);
> > > > +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > +				bind_exec_queues, fence_timeout);
> > > > +		data[0].vm_sync = 0;
> > > > +	}
> > > > +}
> > > > +
> > > > +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i)
> > > > +{
> > > > +	int b = 0;
> > > > +	data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > > > +	data[i].batch[b++] = dst;
> > > > +	data[i].batch[b++] = dst >> 32;
> > > > +	data[i].batch[b++] = val;
> > > > +	data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > > > +	igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > > > +}
> > > > +
> > > > +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int
> n_execs,
> > > > +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> > > > +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> > > > +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> > > > +		int *map_fd, unsigned int flags)
> > > > +{
> > > > +	int i;
> > > > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> > > ONE_SEC;
> > > > +
> > > > +	for (i = 0; i < n_execs; i++) {
> > > > +		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > > > +		uint64_t batch_addr = addr + batch_offset;
> > > > +		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > > > +		uint64_t sdi_addr = addr + sdi_offset;
> > > > +		int e = i % n_exec_queues;
> > > > +
> > > > +		if (flags & INVALID_VA)
> > > > +			sdi_addr = 0x1fffffffffff000;
> > > > +
> > > > +		insert_store(sdi_addr, data, 0xc0ffee, i);
> > > > +
> > > > +		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > > > +
> > > > +		exec->exec_queue_id = exec_queues[e];
> > > > +		exec->address = batch_addr;
> > > > +		xe_exec(fd, exec);
> > > > +
> > > > +		if (flags & REBIND && i + 1 != n_execs) {
> > > > +			xe_wait_ufence(fd, &data[i].exec_sync,
> > > USER_FENCE_VALUE,
> > > > +					exec_queues[e], fence_timeout);
> > > > +			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > > > +					addr, bo_size, NULL, 0);
> > > > +
> > > > +			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > +			addr += bo_size;
> > > > +
> > > > +			if (bo)
> > > > +				xe_vm_bind_async(fd, vm,
> > > bind_exec_queues[e], bo,
> > > > +						0, addr, bo_size, sync, 1);
> > > > +			else
> > > > +				xe_vm_bind_userptr_async(fd, vm,
> > > > +						bind_exec_queues[e],
> > > > +						to_user_pointer(data),
> > > > +						addr, bo_size, sync, 1);
> > > > +
> > > > +			xe_wait_ufence(fd, &data[0].vm_sync,
> > > USER_FENCE_VALUE,
> > > > +					bind_exec_queues[e], fence_timeout);
> > > > +			data[0].vm_sync = 0;
> > > > +		}
> > > > +
> > > > +		if (flags & INVALIDATE && i + 1 != n_execs) {
> > > > +			if (!(flags & RACE)) {
> > > > +				/*
> > > > +				 * Wait for exec completion and check data as
> > > > +				 * userptr will likely change to different
> > > > +				 * physical memory on next mmap call triggering
> > > > +				 * an invalidate.
> > > > +				 */
> > > > +				xe_wait_ufence(fd, &data[i].exec_sync,
> > > > +						USER_FENCE_VALUE,
> > > exec_queues[e],
> > > > +						fence_timeout);
> > > > +				igt_assert_eq(data[i].data, 0xc0ffee);
> > > > +			} else if (i * 2 != n_execs) {
> > > > +				/*
> > > > +				 * We issue 1 mmap which races against running
> > > > +				 * jobs. No real check here aside from this test
> > > > +				 * not faulting on the GPU.
> > > > +				 */
> > > > +				continue;
> > > > +			}
> > > > +
> > > > +			if (flags & RACE) {
> > > > +				*map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > > > +						0x666);
> > > > +				write(*map_fd, data, bo_size);
> > > > +				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > +						PROT_READ | PROT_WRITE,
> > > MAP_SHARED |
> > > > +						MAP_FIXED, *map_fd, 0);
> > > > +			} else {
> > > > +				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > +						PROT_READ | PROT_WRITE,
> > > MAP_SHARED |
> > > > +						MAP_FIXED |
> > > MAP_ANONYMOUS, -1, 0);
> > > > +			}
> > > > +
> > > > +			igt_assert(data != MAP_FAILED);
> > > > +		}
> > > > +	}
> > > > +}
> > > > +
> > > > +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> > > > +		struct drm_xe_sync *sync, struct cpu_va *data, uint64_t addr,
> > > > +		uint32_t bo_size)
> > > > +{
> > > > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> > > ONE_SEC;
> > > > +
> > > > +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > +	xe_vm_unbind_async(fd, vm, bind_exec_queues, 0, addr, bo_size,
> > > > +			sync, 1);
> > > > +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > +			bind_exec_queues, fence_timeout);
> > > > +}
> > > > +
> > > > +void destory_exec_queue(int fd, uint32_t *exec_queues,
> > > > +		uint32_t *bind_exec_queues, int n_exec_queues)
> > > > +{
> > > > +	int i;
> > > > +
> > > > +	for (i = 0; i < n_exec_queues; i++) {
> > > > +		xe_exec_queue_destroy(fd, exec_queues[i]);
> > > > +		if (bind_exec_queues[i])
> > > > +			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > > > +	}
> > > > +}
> > > > +
> > > > diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h
> > > > index 6480ea01a..aa1b0dcc1 100644
> > > > --- a/lib/xe/xe_util.h
> > > > +++ b/lib/xe/xe_util.h
> > > > @@ -47,4 +47,52 @@ void xe_bind_unbind_async(int fd, uint32_t vm,
> uint32_t
> > > bind_engine,
> > > >
> > > >  bool xe_is_gt_in_c6(int fd, int gt);
> > > >
> > > > +
> > > > +#define USERPTR			(0x1 << 0)
> > > > +#define REBIND			(0x1 << 1)
> > > > +#define INVALIDATE		(0x1 << 2)
> > > > +#define RACE			(0x1 << 3)
> > > > +#define BIND_EXEC_QUEUE		(0x1 << 4)
> > > > +#define PREFETCH		(0x1 << 5)
> > > > +#define INVALID_FAULT		(0x1 << 6)
> > > > +#define INVALID_VA		(0x1 << 7)
> > > > +#define ENABLE_SCRATCH		(0x1 << 8)
> > > > +#define VM_FOR_BO		(0x1 << 9)
> > > > +#define EXEC_QUEUE_EARLY	(0x1 << 10)
> > > > +
> > > > +#define MAX_N_EXEC_QUEUES	16
> > > > +#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > > > +#define MAP_ADDRESS		0x00007fadeadbe000
> > > > +
> > > > +#define ONE_SEC		MS_TO_NS(1000)
> > > > +#define HUNDRED_SEC	MS_TO_NS(100000)
> > > > +
> > > > +struct cpu_va {
> > > > +	uint32_t batch[16];
> > > > +	uint64_t pad;
> > > > +	uint64_t vm_sync;
> > > > +	uint64_t exec_sync;
> > > > +	uint32_t data;
> > > > +};
> > > > +
> > > > +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> > > > +		uint32_t bo_size, uint64_t placement,  unsigned int flags);
> > > > +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> > > > +		uint32_t *bind_exec_queues, int n_exec_queues,
> > > > +		struct drm_xe_engine_class_instance *eci, unsigned int flags);
> > > > +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> uint32_t bo,
> > > > +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> > > > +		struct cpu_va *data, unsigned int flags);
> > > > +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int
> n_execs,
> > > > +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> > > > +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> > > > +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> > > > +		int *map_fd, unsigned int flags);
> > > > +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i);
> > > > +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> > > > +		struct drm_xe_sync *sync, struct cpu_va *data,
> > > > +		uint64_t addr, uint32_t bo_size);
> > > > +void destory_exec_queue(int fd, uint32_t *exec_queues,
> > > > +		uint32_t *bind_exec_queues, int n_exec_queues);
> > > > +
> > > >  #endif /* XE_UTIL_H */
> > > > diff --git a/tests/intel/xe_exec_compute_mode.c
> > > b/tests/intel/xe_exec_compute_mode.c
> > > > index 3ec848164..e8d82cc69 100644
> > > > --- a/tests/intel/xe_exec_compute_mode.c
> > > > +++ b/tests/intel/xe_exec_compute_mode.c
> > > > @@ -15,6 +15,7 @@
> > > >  #include "igt.h"
> > > >  #include "lib/igt_syncobj.h"
> > > >  #include "lib/intel_reg.h"
> > > > +#include "lib/xe/xe_util.h"
> > > >  #include <sys/ioctl.h>
> > > >  #include "xe_drm.h"
> > > >
> > > > @@ -23,15 +24,6 @@
> > > >  #include "xe/xe_spin.h"
> > > >  #include <string.h>
> > > >
> > > > -#define MAX_N_EXECQUEUES 	16
> > > > -#define USERPTR				(0x1 << 0)
> > > > -#define REBIND				(0x1 << 1)
> > > > -#define INVALIDATE			(0x1 << 2)
> > > > -#define RACE				(0x1 << 3)
> > > > -#define BIND_EXECQUEUE		(0x1 << 4)
> > > > -#define VM_FOR_BO			(0x1 << 5)
> > > > -#define EXEC_QUEUE_EARLY	(0x1 << 6)
> > > > -
> > > >  /**
> > > >   * SUBTEST: twice-%s
> > > >   * Description: Run %arg[1] compute machine test twice
> > > > @@ -88,7 +80,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> > > *eci,
> > > >  {
> > > >  	uint32_t vm;
> > > >  	uint64_t addr = 0x1a0000;
> > > > -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > > >  	struct drm_xe_sync sync[1] = {
> > > >  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
> > > >  		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> > > > @@ -99,161 +90,38 @@ test_exec(int fd, struct
> drm_xe_engine_class_instance
> > > *eci,
> > > >  		.num_syncs = 1,
> > > >  		.syncs = to_user_pointer(sync),
> > > >  	};
> > > > -	uint32_t exec_queues[MAX_N_EXECQUEUES];
> > > > -	uint32_t bind_exec_queues[MAX_N_EXECQUEUES];
> > > > +	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> > > > +	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
> > > >  	size_t bo_size;
> > > >  	uint32_t bo = 0;
> > > > -	struct {
> > > > -		uint32_t batch[16];
> > > > -		uint64_t pad;
> > > > -		uint64_t vm_sync;
> > > > -		uint64_t exec_sync;
> > > > -		uint32_t data;
> > > > -	} *data;
> > > > -	int i, j, b;
> > > > +	struct cpu_va *data;
> > > > +	int i, j;
> > > >  	int map_fd = -1;
> > > >  	int64_t fence_timeout;
> > > > +	uint64_t placement;
> > > >
> > > > -	igt_assert(n_exec_queues <= MAX_N_EXECQUEUES);
> > > > +	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> > > >
> > > >  	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
> > > >  	bo_size = sizeof(*data) * n_execs;
> > > >  	bo_size = xe_bb_size(fd, bo_size);
> > > >
> > > > -	for (i = 0; (flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> > > > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > > -		if (flags & BIND_EXECQUEUE)
> > > > -			bind_exec_queues[i] =
> > > > -				xe_bind_exec_queue_create(fd, vm, 0);
> > > > -		else
> > > > -			bind_exec_queues[i] = 0;
> > > > -	};
> > > > +	if (flags & EXEC_QUEUE_EARLY)
> > > > +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> > > n_exec_queues, eci, flags);
> > > >
> > > > -	if (flags & USERPTR) {
> > > > -#define	MAP_ADDRESS	0x00007fadeadbe000
> > > > -		if (flags & INVALIDATE) {
> > > > -			data = mmap((void *)MAP_ADDRESS, bo_size,
> > > PROT_READ |
> > > > -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> > > > -				    MAP_ANONYMOUS, -1, 0);
> > > > -			igt_assert(data != MAP_FAILED);
> > > > -		} else {
> > > > -			data = aligned_alloc(xe_get_default_alignment(fd),
> > > > -					     bo_size);
> > > > -			igt_assert(data);
> > > > -		}
> > > > -	} else {
> > > > -		bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0,
> > > > -				  bo_size, vram_if_possible(fd, eci->gt_id),
> > > > -
> > > DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > > -		data = xe_bo_map(fd, bo, bo_size);
> > > > -	}
> > > > -	memset(data, 0, bo_size);
> > > > +	placement = vram_if_possible(fd, eci->gt_id);
> > > >
> > > > -	for (i = 0; !(flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> > > > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > > -		if (flags & BIND_EXECQUEUE)
> > > > -			bind_exec_queues[i] =
> > > > -				xe_bind_exec_queue_create(fd, vm, 0);
> > > > -		else
> > > > -			bind_exec_queues[i] = 0;
> > > > -	};
> > > > +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> > > >
> > > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > -	if (bo)
> > > > -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> > > > -				 bo_size, sync, 1);
> > > > -	else
> > > > -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> > > > -					 to_user_pointer(data), addr,
> > > > -					 bo_size, sync, 1);
> > > > -#define ONE_SEC	MS_TO_NS(1000)
> > > > -#define HUNDRED_SEC	MS_TO_NS(100000)
> > > > +	if(!(flags & EXEC_QUEUE_EARLY))
> > > > +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> > > n_exec_queues, eci, flags);
> > > > +
> > > > +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> > > flags);
> > > >
> > > >  	fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
> > > >
> > > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > -		       bind_exec_queues[0], fence_timeout);
> > > > -	data[0].vm_sync = 0;
> > > > -
> > > > -	for (i = 0; i < n_execs; i++) {
> > > > -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > > > -		uint64_t batch_addr = addr + batch_offset;
> > > > -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > > > -		uint64_t sdi_addr = addr + sdi_offset;
> > > > -		int e = i % n_exec_queues;
> > > > -
> > > > -		b = 0;
> > > > -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > > > -		data[i].batch[b++] = sdi_addr;
> > > > -		data[i].batch[b++] = sdi_addr >> 32;
> > > > -		data[i].batch[b++] = 0xc0ffee;
> > > > -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > > > -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > > > -
> > > > -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > > > -
> > > > -		exec.exec_queue_id = exec_queues[e];
> > > > -		exec.address = batch_addr;
> > > > -		xe_exec(fd, &exec);
> > > > -
> > > > -		if (flags & REBIND && i + 1 != n_execs) {
> > > > -			xe_wait_ufence(fd, &data[i].exec_sync,
> > > USER_FENCE_VALUE,
> > > > -				       exec_queues[e], fence_timeout);
> > > > -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > > > -					   addr, bo_size, NULL, 0);
> > > > -
> > > > -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > -			addr += bo_size;
> > > > -			if (bo)
> > > > -				xe_vm_bind_async(fd, vm,
> > > bind_exec_queues[e], bo,
> > > > -						 0, addr, bo_size, sync, 1);
> > > > -			else
> > > > -				xe_vm_bind_userptr_async(fd, vm,
> > > > -							 bind_exec_queues[e],
> > > > -							 to_user_pointer(data),
> > > > -							 addr, bo_size, sync,
> > > > -							 1);
> > > > -			xe_wait_ufence(fd, &data[0].vm_sync,
> > > USER_FENCE_VALUE,
> > > > -				       bind_exec_queues[e], fence_timeout);
> > > > -			data[0].vm_sync = 0;
> > > > -		}
> > > > -
> > > > -		if (flags & INVALIDATE && i + 1 != n_execs) {
> > > > -			if (!(flags & RACE)) {
> > > > -				/*
> > > > -				 * Wait for exec completion and check data as
> > > > -				 * userptr will likely change to different
> > > > -				 * physical memory on next mmap call triggering
> > > > -				 * an invalidate.
> > > > -				 */
> > > > -				xe_wait_ufence(fd, &data[i].exec_sync,
> > > > -					       USER_FENCE_VALUE,
> > > exec_queues[e],
> > > > -					       fence_timeout);
> > > > -				igt_assert_eq(data[i].data, 0xc0ffee);
> > > > -			} else if (i * 2 != n_execs) {
> > > > -				/*
> > > > -				 * We issue 1 mmap which races against running
> > > > -				 * jobs. No real check here aside from this test
> > > > -				 * not faulting on the GPU.
> > > > -				 */
> > > > -				continue;
> > > > -			}
> > > > -
> > > > -			if (flags & RACE) {
> > > > -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > > > -					      0x666);
> > > > -				write(map_fd, data, bo_size);
> > > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > -					    PROT_READ | PROT_WRITE,
> > > MAP_SHARED |
> > > > -					    MAP_FIXED, map_fd, 0);
> > > > -			} else {
> > > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > -					    PROT_READ | PROT_WRITE,
> > > MAP_SHARED |
> > > > -					    MAP_FIXED | MAP_ANONYMOUS, -1,
> > > 0);
> > > > -			}
> > > > -			igt_assert(data != MAP_FAILED);
> > > > -		}
> > > > -	}
> > > > +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> > > > +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd,
> > > flags);
> > > >
> > > >  	j = flags & INVALIDATE ? n_execs - 1 : 0;
> > > >  	for (i = j; i < n_execs; i++)
> > > > @@ -264,20 +132,12 @@ test_exec(int fd, struct
> > > drm_xe_engine_class_instance *eci,
> > > >  	if (flags & INVALIDATE)
> > > >  		usleep(250000);
> > > >
> > > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> > > > -			   sync, 1);
> > > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > -		       bind_exec_queues[0], fence_timeout);
> > > > +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> > > >
> > > >  	for (i = j; i < n_execs; i++)
> > > >  		igt_assert_eq(data[i].data, 0xc0ffee);
> > > >
> > > > -	for (i = 0; i < n_exec_queues; i++) {
> > > > -		xe_exec_queue_destroy(fd, exec_queues[i]);
> > > > -		if (bind_exec_queues[i])
> > > > -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > > > -	}
> > > > +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> > > n_exec_queues);
> > > >
> > > >  	if (bo) {
> > > >  		munmap(data, bo_size);
> > > > @@ -492,14 +352,14 @@ igt_main
> > > >  		{ "userptr-rebind", USERPTR | REBIND },
> > > >  		{ "userptr-invalidate", USERPTR | INVALIDATE },
> > > >  		{ "userptr-invalidate-race", USERPTR | INVALIDATE | RACE },
> > > > -		{ "bindexecqueue", BIND_EXECQUEUE },
> > > > -		{ "bindexecqueue-userptr", BIND_EXECQUEUE | USERPTR },
> > > > -		{ "bindexecqueue-rebind",  BIND_EXECQUEUE | REBIND },
> > > > -		{ "bindexecqueue-userptr-rebind",  BIND_EXECQUEUE |
> > > USERPTR |
> > > > +		{ "bindexecqueue", BIND_EXEC_QUEUE },
> > > > +		{ "bindexecqueue-userptr", BIND_EXEC_QUEUE | USERPTR },
> > > > +		{ "bindexecqueue-rebind",  BIND_EXEC_QUEUE | REBIND },
> > > > +		{ "bindexecqueue-userptr-rebind",  BIND_EXEC_QUEUE |
> > > USERPTR |
> > > >  			REBIND },
> > > > -		{ "bindexecqueue-userptr-invalidate",  BIND_EXECQUEUE |
> > > USERPTR |
> > > > +		{ "bindexecqueue-userptr-invalidate",  BIND_EXEC_QUEUE |
> > > USERPTR |
> > > >  			INVALIDATE },
> > > > -		{ "bindexecqueue-userptr-invalidate-race", BIND_EXECQUEUE |
> > > USERPTR |
> > > > +		{ "bindexecqueue-userptr-invalidate-race", BIND_EXEC_QUEUE |
> > > USERPTR |
> > > >  			INVALIDATE | RACE },
> > > >  		{ NULL },
> > > >  	};
> > > > diff --git a/tests/intel/xe_exec_fault_mode.c
> > > b/tests/intel/xe_exec_fault_mode.c
> > > > index 40fe1743e..7b2fef224 100644
> > > > --- a/tests/intel/xe_exec_fault_mode.c
> > > > +++ b/tests/intel/xe_exec_fault_mode.c
> > > > @@ -16,24 +16,13 @@
> > > >  #include "igt.h"
> > > >  #include "lib/igt_syncobj.h"
> > > >  #include "lib/intel_reg.h"
> > > > +#include "lib/xe/xe_util.h"
> > > >  #include "xe_drm.h"
> > > >
> > > >  #include "xe/xe_ioctl.h"
> > > >  #include "xe/xe_query.h"
> > > >  #include <string.h>
> > > >
> > > > -#define MAX_N_EXEC_QUEUES	16
> > > > -
> > > > -#define USERPTR		(0x1 << 0)
> > > > -#define REBIND		(0x1 << 1)
> > > > -#define INVALIDATE	(0x1 << 2)
> > > > -#define RACE		(0x1 << 3)
> > > > -#define BIND_EXEC_QUEUE	(0x1 << 4)
> > > > -#define PREFETCH	(0x1 << 5)
> > > > -#define INVALID_FAULT	(0x1 << 6)
> > > > -#define INVALID_VA	(0x1 << 7)
> > > > -#define ENABLE_SCRATCH  (0x1 << 8)
> > > > -
> > > >  /**
> > > >   * SUBTEST: invalid-va
> > > >   * Description: Access invalid va and check for EIO through user fence.
> > > > @@ -99,7 +88,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> > > *eci,
> > > >  {
> > > >  	uint32_t vm;
> > > >  	uint64_t addr = 0x1a0000;
> > > > -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > > >  	struct drm_xe_sync sync[1] = {
> > > >  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags =
> > > DRM_XE_SYNC_FLAG_SIGNAL,
> > > >  	          .timeline_value = USER_FENCE_VALUE },
> > > > @@ -113,14 +101,9 @@ test_exec(int fd, struct
> drm_xe_engine_class_instance
> > > *eci,
> > > >  	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
> > > >  	size_t bo_size;
> > > >  	uint32_t bo = 0;
> > > > -	struct {
> > > > -		uint32_t batch[16];
> > > > -		uint64_t pad;
> > > > -		uint64_t vm_sync;
> > > > -		uint64_t exec_sync;
> > > > -		uint32_t data;
> > > > -	} *data;
> > > > -	int i, j, b;
> > > > +	struct cpu_va *data;
> > > > +	uint64_t placement;
> > > > +	int i, j;
> > > >  	int map_fd = -1;
> > > >
> > > >  	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> > > > @@ -134,144 +117,19 @@ test_exec(int fd, struct
> > > drm_xe_engine_class_instance *eci,
> > > >  	bo_size = sizeof(*data) * n_execs;
> > > >  	bo_size = xe_bb_size(fd, bo_size);
> > > >
> > > > -	if (flags & USERPTR) {
> > > > -#define	MAP_ADDRESS	0x00007fadeadbe000
> > > > -		if (flags & INVALIDATE) {
> > > > -			data = mmap((void *)MAP_ADDRESS, bo_size,
> > > PROT_READ |
> > > > -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> > > > -				    MAP_ANONYMOUS, -1, 0);
> > > > -			igt_assert(data != MAP_FAILED);
> > > > -		} else {
> > > > -			data = aligned_alloc(xe_get_default_alignment(fd),
> > > > -					     bo_size);
> > > > -			igt_assert(data);
> > > > -		}
> > > > -	} else {
> > > > -		if (flags & PREFETCH)
> > > > -			bo = xe_bo_create(fd, 0, bo_size,
> > > > -					  all_memory_regions(fd) |
> > > > -					  vram_if_possible(fd, 0),
> > > > -
> > > DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > > -		else
> > > > -			bo = xe_bo_create(fd, 0, bo_size,
> > > > -					  vram_if_possible(fd, eci->gt_id),
> > > > -
> > > DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > > -		data = xe_bo_map(fd, bo, bo_size);
> > > > -	}
> > > > -	memset(data, 0, bo_size);
> > > > -
> > > > -	for (i = 0; i < n_exec_queues; i++) {
> > > > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > > -		if (flags & BIND_EXEC_QUEUE)
> > > > -			bind_exec_queues[i] =
> > > > -				xe_bind_exec_queue_create(fd, vm, 0);
> > > > -		else
> > > > -			bind_exec_queues[i] = 0;
> > > > -	};
> > > > -
> > > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > -	if (bo)
> > > > -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> > > bo_size, sync, 1);
> > > > -	else
> > > > -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> > > > -					 to_user_pointer(data), addr,
> > > > -					 bo_size, sync, 1);
> > > > -#define ONE_SEC	MS_TO_NS(1000)
> > > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > -		       bind_exec_queues[0], ONE_SEC);
> > > > -	data[0].vm_sync = 0;
> > > > -
> > > > -	if (flags & PREFETCH) {
> > > > -		/* Should move to system memory */
> > > > -		xe_vm_prefetch_async(fd, vm, bind_exec_queues[0], 0, addr,
> > > > -				     bo_size, sync, 1, 0);
> > > > -		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > -			       bind_exec_queues[0], ONE_SEC);
> > > > -		data[0].vm_sync = 0;
> > > > -	}
> > > > -
> > > > -	for (i = 0; i < n_execs; i++) {
> > > > -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > > > -		uint64_t batch_addr = addr + batch_offset;
> > > > -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > > > -		uint64_t sdi_addr = addr + sdi_offset;
> > > > -		int e = i % n_exec_queues;
> > > > +	placement = (flags & PREFETCH) ? all_memory_regions(fd) |
> > > > +		vram_if_possible(fd, 0) : vram_if_possible(fd, eci->gt_id);
> > > >
> > > > -		b = 0;
> > > > -		if (flags & INVALID_VA)
> > > > -			sdi_addr = 0x1fffffffffff000;
> > > > +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> > > >
> > > > -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > > > -		data[i].batch[b++] = sdi_addr;
> > > > -		data[i].batch[b++] = sdi_addr >> 32;
> > > > -		data[i].batch[b++] = 0xc0ffee;
> > > > -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > > > -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > > > +	create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> > > > +			n_exec_queues, eci, flags);
> > > >
> > > > -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > > > +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> > > flags);
> > > >
> > > > -		exec.exec_queue_id = exec_queues[e];
> > > > -		exec.address = batch_addr;
> > > > -		xe_exec(fd, &exec);
> > > > +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> > > > +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd,
> > > flags);
> > > >
> > > > -		if (flags & REBIND && i + 1 != n_execs) {
> > > > -			xe_wait_ufence(fd, &data[i].exec_sync,
> > > USER_FENCE_VALUE,
> > > > -				       exec_queues[e], ONE_SEC);
> > > > -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > > > -					   addr, bo_size, NULL, 0);
> > > > -
> > > > -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > -			addr += bo_size;
> > > > -			if (bo)
> > > > -				xe_vm_bind_async(fd, vm,
> > > bind_exec_queues[e], bo,
> > > > -						 0, addr, bo_size, sync, 1);
> > > > -			else
> > > > -				xe_vm_bind_userptr_async(fd, vm,
> > > > -							 bind_exec_queues[e],
> > > > -							 to_user_pointer(data),
> > > > -							 addr, bo_size, sync,
> > > > -							 1);
> > > > -			xe_wait_ufence(fd, &data[0].vm_sync,
> > > USER_FENCE_VALUE,
> > > > -				       bind_exec_queues[e], ONE_SEC);
> > > > -			data[0].vm_sync = 0;
> > > > -		}
> > > > -
> > > > -		if (flags & INVALIDATE && i + 1 != n_execs) {
> > > > -			if (!(flags & RACE)) {
> > > > -				/*
> > > > -				 * Wait for exec completion and check data as
> > > > -				 * userptr will likely change to different
> > > > -				 * physical memory on next mmap call triggering
> > > > -				 * an invalidate.
> > > > -				 */
> > > > -				xe_wait_ufence(fd, &data[i].exec_sync,
> > > > -					       USER_FENCE_VALUE,
> > > exec_queues[e],
> > > > -					       ONE_SEC);
> > > > -				igt_assert_eq(data[i].data, 0xc0ffee);
> > > > -			} else if (i * 2 != n_execs) {
> > > > -				/*
> > > > -				 * We issue 1 mmap which races against running
> > > > -				 * jobs. No real check here aside from this test
> > > > -				 * not faulting on the GPU.
> > > > -				 */
> > > > -				continue;
> > > > -			}
> > > > -
> > > > -			if (flags & RACE) {
> > > > -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > > > -					      0x666);
> > > > -				write(map_fd, data, bo_size);
> > > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > -					    PROT_READ | PROT_WRITE,
> > > MAP_SHARED |
> > > > -					    MAP_FIXED, map_fd, 0);
> > > > -			} else {
> > > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > -					    PROT_READ | PROT_WRITE,
> > > MAP_SHARED |
> > > > -					    MAP_FIXED | MAP_ANONYMOUS, -1,
> > > 0);
> > > > -			}
> > > > -			igt_assert(data != MAP_FAILED);
> > > > -		}
> > > > -	}
> > > >  	if (!(flags & INVALID_FAULT)) {
> > > >  		int64_t timeout = ONE_SEC;
> > > >
> > > > @@ -286,22 +144,15 @@ test_exec(int fd, struct
> > > drm_xe_engine_class_instance *eci,
> > > >  							       exec_queues[i %
> > > n_exec_queues], &timeout), 0);
> > > >  		}
> > > >  	}
> > > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> > > > -			   sync, 1);
> > > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > -		       bind_exec_queues[0], ONE_SEC);
> > > > +
> > > > +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> > > >
> > > >  	if (!(flags & INVALID_FAULT) && !(flags & INVALID_VA)) {
> > > >  		for (i = j; i < n_execs; i++)
> > > >  				igt_assert_eq(data[i].data, 0xc0ffee);
> > > >  	}
> > > >
> > > > -	for (i = 0; i < n_exec_queues; i++) {
> > > > -		xe_exec_queue_destroy(fd, exec_queues[i]);
> > > > -		if (bind_exec_queues[i])
> > > > -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > > > -	}
> > > > +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> > > n_exec_queues);
> > > >
> > > >  	if (bo) {
> > > >  		munmap(data, bo_size);
> > > > --
> > > > 2.25.1
> > > >

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] lib/xe/xe_util: Creating the helper functions
  2024-04-01 20:28       ` Zeng, Oak
@ 2024-04-01 22:32         ` Matthew Brost
  0 siblings, 0 replies; 9+ messages in thread
From: Matthew Brost @ 2024-04-01 22:32 UTC (permalink / raw)
  To: Zeng, Oak
  Cc: Bommu, Krishnaiah, intel-xe@lists.freedesktop.org,
	Kumar, Janga Rahul, Ghimiray, Himal Prasad, Chehab, Mauro

On Mon, Apr 01, 2024 at 02:28:58PM -0600, Zeng, Oak wrote:
> Hi Matt
> 
> > -----Original Message-----
> > From: Brost, Matthew <matthew.brost@intel.com>
> > Sent: Monday, April 1, 2024 2:40 PM
> > To: Zeng, Oak <oak.zeng@intel.com>
> > Cc: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; intel-
> > xe@lists.freedesktop.org; Kumar, Janga Rahul <janga.rahul.kumar@intel.com>;
> > Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Chehab, Mauro
> > <mauro.chehab@intel.com>
> > Subject: Re: [PATCH] lib/xe/xe_util: Creating the helper functions
> > 
> > On Mon, Apr 01, 2024 at 10:10:06AM -0600, Zeng, Oak wrote:
> > > Hi Matt,
> > >
> > > > -----Original Message-----
> > > > From: Brost, Matthew <matthew.brost@intel.com>
> > > > Sent: Sunday, March 31, 2024 5:05 PM
> > > > To: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>
> > > > Cc: intel-xe@lists.freedesktop.org; Kumar, Janga Rahul
> > > > <janga.rahul.kumar@intel.com>; Ghimiray, Himal Prasad
> > > > <himal.prasad.ghimiray@intel.com>; Zeng, Oak <oak.zeng@intel.com>
> > > > Subject: Re: [PATCH] lib/xe/xe_util: Creating the helper functions
> > > >
> > > > On Mon, Apr 01, 2024 at 12:29:49AM +0530, Bommu Krishnaiah wrote:
> > > > > Creating the helper functions which can be reused by many tests,
> > > > > as part of this patch xe_exec_compute_mode and xe_exec_fault_mode
> > > > > are used this helper functions.
> > > > >
> > > >
> > > > As the author of most these tests and IGTs, I can say while not great to
> > > > have copy paste between tests I much prefer it to adding helpers to try
> > > > to jam the needs of each test into the helpers. IMO this just creates a
> > > > headache in the long run. e.g. Let's say I want to change the behavior
> > > > of xe_exec_compute_mode but not xe_exec_fault_mode, with shared
> > helpers
> > > > this gets harder.
> > >
> > > Duplicated codes is not a good practice in software. When you have a fix, you
> > need to fix all the places where you have the duplicated codes.
> > >
> > 
> > I understand concept of why to not use duplicate codes but see below the
> > design philosophy of these tests.
> > 
> > > In the example you give, it depends on how we make the helpers. If the
> > helpers are not abstract well enough, we will run into the problem you said.
> > >
> > > The helper idea is inherited from i915. With those helpers, we can write a test
> > with less than 50 lines of codes. In many cases, just ~20 lines of code, very easy to
> > read and debug. Just take a look of i915 gem_vm_pagefault.c and i915_svm.c.
> > >
> > 
> > gem_vm_pagefault.c is 2000 plus lines of code for 27 tests.
> > 
> > The test_exec function in xe_exec_compute_mode is ~150 lines of code
> > which generates 50+ plus tests. There is power to a single data path
> > controlled by various flags - this leads to the pattern of testing which
> > is not directed rather do a bunch of things all at the same time. That
> > is the intent of tests which use this style of coding (xe_exec_basic,
> > xe_exec_compute_mode, xe_exec_balancer, xe_exec_fault_mode, and
> > xe_exec_threads). I've used this style of coding to test hardware /
> > software over the years (started my career writing functional tests for
> > hardware) and can say these style of tests are by far the most effective
> > for finding bugs / producing coverage / confidence.
> 
> 
> Yes, I was also amazed by the efficiency of the xe style test. But that doesn't prohibit we introduce a few helpers.
> 
> As said in the other thread of this code review, I agree we can continue this style for the existing test. I also asked Krishna to remove some big helpers. The idea is, we will introduce a few simple helpers (mainly for the system allocator tests which will take another style: each test is simple, but overall has less test/LOC efficiency). 
> 
> We will apply those helpers for the existing tests but won't change your coding style. 
> 
> > 
> > Also they should be easily extentable, see [1] which adds some new
> > sections to xe_exec_compute_mode to test [2]. New tests are added by
> > adding a new flag + update data path with new flags. This is my example
> > why for these types of tests helpers replacing the data-path is not
> > ideal.
> > 
> > With all of this, I'm not suggesting helpers are not useful nor I am
> > suggesting directed testing is not useful (see xe_vm, largely a
> > directed test one of the most essential tests in the suite) just in this
> > case I do not think the aforementioned tests will benefit from helpers
> > as written in this patch.
> 
> Sure. Let's Krishna rework on the helpers and let's see what we get. As said, our goal is not to simplify the existing test, nor to change the coding style of existing test. 
> 
> Whether we can apply helpers to existing tests? It all depends on how we abstract the helpers well enough. I believe helpers such as create and map buffer, create and fill command buffer, submit command buffer etc can simplify things a little. But lets wait for Krishna's improvement and see.
>

That roughly what I think - BO / memory allocation, programming batches are where
we can improve these tests with helpers.

Matt

> 
> 
> > 
> > [1] https://patchwork.freedesktop.org/patch/583145/?series=131211&rev=1
> > [2] https://patchwork.freedesktop.org/series/130935/
> > 
> > > Xe igt is rather complicated compared above i915 tests. In the past few days, I
> > had to debug into xe-exec-threads, it took me long time to really understand the
> > test. The main reason is, we have a huge function such as test_legacy_mode, and
> > flags is used to control the behavior of the test. This is hard to read, maintain and
> > debug.
> > >
> > 
> > Saying something is complicated is a matter of opinion. I think the i915
> > tests are incomprehensible and can easily understand the Xe tests but
> > course I wrote the Xe tests... I suspect the inverse is true in the i915
> > tests...
> > 
> > I will say if xe_exec_threads runs to completion the condfience level in
> > submission working Xe is very, very high. Again ~1500 lines of code
> > generates ~120 tests.
> 
> Agreed. Most of the i915 tests are also the same style as you coded for xe. Just those two I mentioned above used another style. I have no doubt of the efficiency of that style. As said, I don't have any intension to change this style.
> 
> > 
> > > >
> > > > I'd much rather leave this as is.
> > > >
> > > > Maybe I can buy a helper or two to program the batches, i.e.
> > > > insert_store LGTM, but certainly not the logic doing bind, invalidates,
> > > > loop, etc...
> > >
> > >
> > > Let me take a look of the helpers, and see whether we can make it better. Our
> > plan is to use those helpers for system allocator test. We use existing tests to test
> > those helpers because right now we don't have system allocator ready. So let's
> > improve it first. We might end up something which is good for both existing test
> > and coming system allocator test.
> > >
> > 
> > Again I don't like in general predicting the future... But I'd almostly
> > certainly say a test in the style of afformentioned tests will be
> > written too with me insisting or writing one myself given my experience
> > doing so.
> > 
> > Off the top of my head...
> > 
> > - various allocation sizes
> > - mix mmap / malloc / BO allocations
> > - aligned / unaligned allocations
> > - different madvise policies
> > - allocations that faulted in (CPU touches pages before GPU) or not
> >   faulted in (no backing store on GPU access)
> > - races between CPU / GPU access
> > - sparse access to allocations
> > - dynamic freeing of allocations
> > - intentionally leak memory
> > - invalid addresses
> > - allocate / GPU fault in more memory than VRAM available causing eviction
> > 
> > All of this within a single function controlled by flags.
> 
> In the coding style I preferred, each of above lines will be one function, vs single function controlled by flags. I am pretty sure my style will be more readable but at the cost of more LOC.
> 
> Again, I think both styles works. Both have its pros and cons. I don't have any intension to change the coding style of existing tests.
> 
> 
> > 
> > Extend to multi-threaded versions with dedicated VMs or a shared VMs.
> > 
> > Extend to multi-process version with dedicated VMs and FDs.
> > 
> > So this will probably be 100s of tests with more or less 1 function
> > called with different arguments. I bet if a test like this run to
> > completion confidence in SA design is very, very high.
> > 
> > > We will let you decide whether you want to use those helpers for existing tests.
> > If we can't make something good for existing tests, we will target to using those
> > helpers only for the coming system allocator tests.
> > >
> > 
> > Always open to improvements but my main point here is - helpers
> > replacing the data-path is a tricky proposition.
> 
> Again, we will not change the data path.
> 
> Oak
> 
> 
> > 
> > Matt
> > 
> > > Thanks,
> > > Oak
> > >
> > > >
> > > > Matt
> > > >
> > > > > Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu@intel.com>
> > > > > Cc: Janga Rahul Kumar <janga.rahul.kumar@intel.com>
> > > > > Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> > > > > Cc: Oak Zeng <oak.zeng@intel.com>
> > > > > ---
> > > > >  lib/xe/xe_util.c                   | 194 +++++++++++++++++++++++++++++
> > > > >  lib/xe/xe_util.h                   |  48 +++++++
> > > > >  tests/intel/xe_exec_compute_mode.c | 190 ++++------------------------
> > > > >  tests/intel/xe_exec_fault_mode.c   | 179 +++-----------------------
> > > > >  4 files changed, 282 insertions(+), 329 deletions(-)
> > > > >
> > > > > diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c
> > > > > index 050162b5e..17cd45e45 100644
> > > > > --- a/lib/xe/xe_util.c
> > > > > +++ b/lib/xe/xe_util.c
> > > > > @@ -255,3 +255,197 @@ bool xe_is_gt_in_c6(int fd, int gt)
> > > > >
> > > > >  	return strcmp(gt_c_state, "gt-c6") == 0;
> > > > >  }
> > > > > +
> > > > > +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> > > > > +		uint32_t bo_size, uint64_t placement, unsigned int flags)
> > > > > +{
> > > > > +	struct cpu_va *data;
> > > > > +
> > > > > +	if (flags & USERPTR) {
> > > > > +		if (flags & INVALIDATE) {
> > > > > +			data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > PROT_READ | PROT_WRITE,
> > > > > +					MAP_SHARED | MAP_FIXED |
> > > > MAP_ANONYMOUS, -1, 0);
> > > > > +			igt_assert(data != MAP_FAILED);
> > > > > +		} else {
> > > > > +			data = aligned_alloc(xe_get_default_alignment(fd),
> > > > bo_size);
> > > > > +			igt_assert(data);
> > > > > +		}
> > > > > +	} else {
> > > > > +		*bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0, bo_size,
> > > > placement,
> > > > > +
> > > > 	DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > > > +		data = xe_bo_map(fd, *bo, bo_size);
> > > > > +	}
> > > > > +
> > > > > +	memset(data, 0, bo_size);
> > > > > +
> > > > > +	return data;
> > > > > +}
> > > > > +
> > > > > +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> > > > > +		uint32_t *bind_exec_queues, int n_exec_queues,
> > > > > +		struct drm_xe_engine_class_instance *eci, unsigned int flags)
> > > > > +{
> > > > > +	int i;
> > > > > +
> > > > > +	for (i = 0; i < n_exec_queues; i++) {
> > > > > +		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > > > +		if (flags & BIND_EXEC_QUEUE)
> > > > > +			bind_exec_queues[i] = xe_bind_exec_queue_create(fd,
> > > > vm, 0);
> > > > > +		else
> > > > > +			bind_exec_queues[i] = 0;
> > > > > +	};
> > > > > +}
> > > > > +
> > > > > +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> > uint32_t bo,
> > > > > +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> > > > > +		struct cpu_va *data, unsigned int flags)
> > > > > +{
> > > > > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> > > > ONE_SEC;
> > > > > +
> > > > > +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > > +	if (bo)
> > > > > +		xe_vm_bind_async(fd, vm, bind_exec_queues, bo, 0, addr,
> > > > bo_size, sync, 1);
> > > > > +	else
> > > > > +		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues,
> > > > > +				to_user_pointer(data), addr,
> > > > > +				bo_size, sync, 1);
> > > > > +
> > > > > +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > > +			bind_exec_queues, fence_timeout);
> > > > > +	data[0].vm_sync = 0;
> > > > > +
> > > > > +	if (flags & PREFETCH) {
> > > > > +		/* Should move to system memory */
> > > > > +		xe_vm_prefetch_async(fd, vm, bind_exec_queues, 0, addr,
> > > > > +				bo_size, sync, 1, 0);
> > > > > +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > > +				bind_exec_queues, fence_timeout);
> > > > > +		data[0].vm_sync = 0;
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i)
> > > > > +{
> > > > > +	int b = 0;
> > > > > +	data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > > > > +	data[i].batch[b++] = dst;
> > > > > +	data[i].batch[b++] = dst >> 32;
> > > > > +	data[i].batch[b++] = val;
> > > > > +	data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > > > > +	igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > > > > +}
> > > > > +
> > > > > +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int
> > n_execs,
> > > > > +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> > > > > +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> > > > > +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> > > > > +		int *map_fd, unsigned int flags)
> > > > > +{
> > > > > +	int i;
> > > > > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> > > > ONE_SEC;
> > > > > +
> > > > > +	for (i = 0; i < n_execs; i++) {
> > > > > +		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > > > > +		uint64_t batch_addr = addr + batch_offset;
> > > > > +		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > > > > +		uint64_t sdi_addr = addr + sdi_offset;
> > > > > +		int e = i % n_exec_queues;
> > > > > +
> > > > > +		if (flags & INVALID_VA)
> > > > > +			sdi_addr = 0x1fffffffffff000;
> > > > > +
> > > > > +		insert_store(sdi_addr, data, 0xc0ffee, i);
> > > > > +
> > > > > +		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > > > > +
> > > > > +		exec->exec_queue_id = exec_queues[e];
> > > > > +		exec->address = batch_addr;
> > > > > +		xe_exec(fd, exec);
> > > > > +
> > > > > +		if (flags & REBIND && i + 1 != n_execs) {
> > > > > +			xe_wait_ufence(fd, &data[i].exec_sync,
> > > > USER_FENCE_VALUE,
> > > > > +					exec_queues[e], fence_timeout);
> > > > > +			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > > > > +					addr, bo_size, NULL, 0);
> > > > > +
> > > > > +			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > > +			addr += bo_size;
> > > > > +
> > > > > +			if (bo)
> > > > > +				xe_vm_bind_async(fd, vm,
> > > > bind_exec_queues[e], bo,
> > > > > +						0, addr, bo_size, sync, 1);
> > > > > +			else
> > > > > +				xe_vm_bind_userptr_async(fd, vm,
> > > > > +						bind_exec_queues[e],
> > > > > +						to_user_pointer(data),
> > > > > +						addr, bo_size, sync, 1);
> > > > > +
> > > > > +			xe_wait_ufence(fd, &data[0].vm_sync,
> > > > USER_FENCE_VALUE,
> > > > > +					bind_exec_queues[e], fence_timeout);
> > > > > +			data[0].vm_sync = 0;
> > > > > +		}
> > > > > +
> > > > > +		if (flags & INVALIDATE && i + 1 != n_execs) {
> > > > > +			if (!(flags & RACE)) {
> > > > > +				/*
> > > > > +				 * Wait for exec completion and check data as
> > > > > +				 * userptr will likely change to different
> > > > > +				 * physical memory on next mmap call triggering
> > > > > +				 * an invalidate.
> > > > > +				 */
> > > > > +				xe_wait_ufence(fd, &data[i].exec_sync,
> > > > > +						USER_FENCE_VALUE,
> > > > exec_queues[e],
> > > > > +						fence_timeout);
> > > > > +				igt_assert_eq(data[i].data, 0xc0ffee);
> > > > > +			} else if (i * 2 != n_execs) {
> > > > > +				/*
> > > > > +				 * We issue 1 mmap which races against running
> > > > > +				 * jobs. No real check here aside from this test
> > > > > +				 * not faulting on the GPU.
> > > > > +				 */
> > > > > +				continue;
> > > > > +			}
> > > > > +
> > > > > +			if (flags & RACE) {
> > > > > +				*map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > > > > +						0x666);
> > > > > +				write(*map_fd, data, bo_size);
> > > > > +				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > > +						PROT_READ | PROT_WRITE,
> > > > MAP_SHARED |
> > > > > +						MAP_FIXED, *map_fd, 0);
> > > > > +			} else {
> > > > > +				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > > +						PROT_READ | PROT_WRITE,
> > > > MAP_SHARED |
> > > > > +						MAP_FIXED |
> > > > MAP_ANONYMOUS, -1, 0);
> > > > > +			}
> > > > > +
> > > > > +			igt_assert(data != MAP_FAILED);
> > > > > +		}
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> > > > > +		struct drm_xe_sync *sync, struct cpu_va *data, uint64_t addr,
> > > > > +		uint32_t bo_size)
> > > > > +{
> > > > > +	int64_t fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC :
> > > > ONE_SEC;
> > > > > +
> > > > > +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > > +	xe_vm_unbind_async(fd, vm, bind_exec_queues, 0, addr, bo_size,
> > > > > +			sync, 1);
> > > > > +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > > +			bind_exec_queues, fence_timeout);
> > > > > +}
> > > > > +
> > > > > +void destory_exec_queue(int fd, uint32_t *exec_queues,
> > > > > +		uint32_t *bind_exec_queues, int n_exec_queues)
> > > > > +{
> > > > > +	int i;
> > > > > +
> > > > > +	for (i = 0; i < n_exec_queues; i++) {
> > > > > +		xe_exec_queue_destroy(fd, exec_queues[i]);
> > > > > +		if (bind_exec_queues[i])
> > > > > +			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h
> > > > > index 6480ea01a..aa1b0dcc1 100644
> > > > > --- a/lib/xe/xe_util.h
> > > > > +++ b/lib/xe/xe_util.h
> > > > > @@ -47,4 +47,52 @@ void xe_bind_unbind_async(int fd, uint32_t vm,
> > uint32_t
> > > > bind_engine,
> > > > >
> > > > >  bool xe_is_gt_in_c6(int fd, int gt);
> > > > >
> > > > > +
> > > > > +#define USERPTR			(0x1 << 0)
> > > > > +#define REBIND			(0x1 << 1)
> > > > > +#define INVALIDATE		(0x1 << 2)
> > > > > +#define RACE			(0x1 << 3)
> > > > > +#define BIND_EXEC_QUEUE		(0x1 << 4)
> > > > > +#define PREFETCH		(0x1 << 5)
> > > > > +#define INVALID_FAULT		(0x1 << 6)
> > > > > +#define INVALID_VA		(0x1 << 7)
> > > > > +#define ENABLE_SCRATCH		(0x1 << 8)
> > > > > +#define VM_FOR_BO		(0x1 << 9)
> > > > > +#define EXEC_QUEUE_EARLY	(0x1 << 10)
> > > > > +
> > > > > +#define MAX_N_EXEC_QUEUES	16
> > > > > +#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > > > > +#define MAP_ADDRESS		0x00007fadeadbe000
> > > > > +
> > > > > +#define ONE_SEC		MS_TO_NS(1000)
> > > > > +#define HUNDRED_SEC	MS_TO_NS(100000)
> > > > > +
> > > > > +struct cpu_va {
> > > > > +	uint32_t batch[16];
> > > > > +	uint64_t pad;
> > > > > +	uint64_t vm_sync;
> > > > > +	uint64_t exec_sync;
> > > > > +	uint32_t data;
> > > > > +};
> > > > > +
> > > > > +struct cpu_va *create_xe_bo(int fd, uint32_t vm, uint32_t *bo,
> > > > > +		uint32_t bo_size, uint64_t placement,  unsigned int flags);
> > > > > +void create_exec_queue(int fd, uint32_t vm, uint32_t *exec_queues,
> > > > > +		uint32_t *bind_exec_queues, int n_exec_queues,
> > > > > +		struct drm_xe_engine_class_instance *eci, unsigned int flags);
> > > > > +void xe_bo_bind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> > uint32_t bo,
> > > > > +		uint32_t bo_size, uint64_t addr, struct drm_xe_sync *sync,
> > > > > +		struct cpu_va *data, unsigned int flags);
> > > > > +void xe_execbuf(int fd, uint32_t vm, struct drm_xe_exec *exec, int
> > n_execs,
> > > > > +		uint32_t *exec_queues, uint32_t *bind_exec_queues,
> > > > > +		int n_exec_queues, uint32_t bo, uint32_t bo_size,
> > > > > +		struct drm_xe_sync *sync, uint64_t addr, struct cpu_va *data,
> > > > > +		int *map_fd, unsigned int flags);
> > > > > +void insert_store(uint64_t dst, struct cpu_va *data, uint32_t val, int i);
> > > > > +void xe_vm_unbind(int fd, uint32_t vm, uint32_t bind_exec_queues,
> > > > > +		struct drm_xe_sync *sync, struct cpu_va *data,
> > > > > +		uint64_t addr, uint32_t bo_size);
> > > > > +void destory_exec_queue(int fd, uint32_t *exec_queues,
> > > > > +		uint32_t *bind_exec_queues, int n_exec_queues);
> > > > > +
> > > > >  #endif /* XE_UTIL_H */
> > > > > diff --git a/tests/intel/xe_exec_compute_mode.c
> > > > b/tests/intel/xe_exec_compute_mode.c
> > > > > index 3ec848164..e8d82cc69 100644
> > > > > --- a/tests/intel/xe_exec_compute_mode.c
> > > > > +++ b/tests/intel/xe_exec_compute_mode.c
> > > > > @@ -15,6 +15,7 @@
> > > > >  #include "igt.h"
> > > > >  #include "lib/igt_syncobj.h"
> > > > >  #include "lib/intel_reg.h"
> > > > > +#include "lib/xe/xe_util.h"
> > > > >  #include <sys/ioctl.h>
> > > > >  #include "xe_drm.h"
> > > > >
> > > > > @@ -23,15 +24,6 @@
> > > > >  #include "xe/xe_spin.h"
> > > > >  #include <string.h>
> > > > >
> > > > > -#define MAX_N_EXECQUEUES 	16
> > > > > -#define USERPTR				(0x1 << 0)
> > > > > -#define REBIND				(0x1 << 1)
> > > > > -#define INVALIDATE			(0x1 << 2)
> > > > > -#define RACE				(0x1 << 3)
> > > > > -#define BIND_EXECQUEUE		(0x1 << 4)
> > > > > -#define VM_FOR_BO			(0x1 << 5)
> > > > > -#define EXEC_QUEUE_EARLY	(0x1 << 6)
> > > > > -
> > > > >  /**
> > > > >   * SUBTEST: twice-%s
> > > > >   * Description: Run %arg[1] compute machine test twice
> > > > > @@ -88,7 +80,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> > > > *eci,
> > > > >  {
> > > > >  	uint32_t vm;
> > > > >  	uint64_t addr = 0x1a0000;
> > > > > -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > > > >  	struct drm_xe_sync sync[1] = {
> > > > >  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
> > > > >  		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> > > > > @@ -99,161 +90,38 @@ test_exec(int fd, struct
> > drm_xe_engine_class_instance
> > > > *eci,
> > > > >  		.num_syncs = 1,
> > > > >  		.syncs = to_user_pointer(sync),
> > > > >  	};
> > > > > -	uint32_t exec_queues[MAX_N_EXECQUEUES];
> > > > > -	uint32_t bind_exec_queues[MAX_N_EXECQUEUES];
> > > > > +	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> > > > > +	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
> > > > >  	size_t bo_size;
> > > > >  	uint32_t bo = 0;
> > > > > -	struct {
> > > > > -		uint32_t batch[16];
> > > > > -		uint64_t pad;
> > > > > -		uint64_t vm_sync;
> > > > > -		uint64_t exec_sync;
> > > > > -		uint32_t data;
> > > > > -	} *data;
> > > > > -	int i, j, b;
> > > > > +	struct cpu_va *data;
> > > > > +	int i, j;
> > > > >  	int map_fd = -1;
> > > > >  	int64_t fence_timeout;
> > > > > +	uint64_t placement;
> > > > >
> > > > > -	igt_assert(n_exec_queues <= MAX_N_EXECQUEUES);
> > > > > +	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> > > > >
> > > > >  	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
> > > > >  	bo_size = sizeof(*data) * n_execs;
> > > > >  	bo_size = xe_bb_size(fd, bo_size);
> > > > >
> > > > > -	for (i = 0; (flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> > > > > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > > > -		if (flags & BIND_EXECQUEUE)
> > > > > -			bind_exec_queues[i] =
> > > > > -				xe_bind_exec_queue_create(fd, vm, 0);
> > > > > -		else
> > > > > -			bind_exec_queues[i] = 0;
> > > > > -	};
> > > > > +	if (flags & EXEC_QUEUE_EARLY)
> > > > > +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> > > > n_exec_queues, eci, flags);
> > > > >
> > > > > -	if (flags & USERPTR) {
> > > > > -#define	MAP_ADDRESS	0x00007fadeadbe000
> > > > > -		if (flags & INVALIDATE) {
> > > > > -			data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > PROT_READ |
> > > > > -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> > > > > -				    MAP_ANONYMOUS, -1, 0);
> > > > > -			igt_assert(data != MAP_FAILED);
> > > > > -		} else {
> > > > > -			data = aligned_alloc(xe_get_default_alignment(fd),
> > > > > -					     bo_size);
> > > > > -			igt_assert(data);
> > > > > -		}
> > > > > -	} else {
> > > > > -		bo = xe_bo_create(fd, flags & VM_FOR_BO ? vm : 0,
> > > > > -				  bo_size, vram_if_possible(fd, eci->gt_id),
> > > > > -
> > > > DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > > > -		data = xe_bo_map(fd, bo, bo_size);
> > > > > -	}
> > > > > -	memset(data, 0, bo_size);
> > > > > +	placement = vram_if_possible(fd, eci->gt_id);
> > > > >
> > > > > -	for (i = 0; !(flags & EXEC_QUEUE_EARLY) && i < n_exec_queues; i++) {
> > > > > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > > > -		if (flags & BIND_EXECQUEUE)
> > > > > -			bind_exec_queues[i] =
> > > > > -				xe_bind_exec_queue_create(fd, vm, 0);
> > > > > -		else
> > > > > -			bind_exec_queues[i] = 0;
> > > > > -	};
> > > > > +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> > > > >
> > > > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > > -	if (bo)
> > > > > -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> > > > > -				 bo_size, sync, 1);
> > > > > -	else
> > > > > -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> > > > > -					 to_user_pointer(data), addr,
> > > > > -					 bo_size, sync, 1);
> > > > > -#define ONE_SEC	MS_TO_NS(1000)
> > > > > -#define HUNDRED_SEC	MS_TO_NS(100000)
> > > > > +	if(!(flags & EXEC_QUEUE_EARLY))
> > > > > +		create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> > > > n_exec_queues, eci, flags);
> > > > > +
> > > > > +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> > > > flags);
> > > > >
> > > > >  	fence_timeout = igt_run_in_simulation() ? HUNDRED_SEC : ONE_SEC;
> > > > >
> > > > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > > -		       bind_exec_queues[0], fence_timeout);
> > > > > -	data[0].vm_sync = 0;
> > > > > -
> > > > > -	for (i = 0; i < n_execs; i++) {
> > > > > -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > > > > -		uint64_t batch_addr = addr + batch_offset;
> > > > > -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > > > > -		uint64_t sdi_addr = addr + sdi_offset;
> > > > > -		int e = i % n_exec_queues;
> > > > > -
> > > > > -		b = 0;
> > > > > -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > > > > -		data[i].batch[b++] = sdi_addr;
> > > > > -		data[i].batch[b++] = sdi_addr >> 32;
> > > > > -		data[i].batch[b++] = 0xc0ffee;
> > > > > -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > > > > -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > > > > -
> > > > > -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > > > > -
> > > > > -		exec.exec_queue_id = exec_queues[e];
> > > > > -		exec.address = batch_addr;
> > > > > -		xe_exec(fd, &exec);
> > > > > -
> > > > > -		if (flags & REBIND && i + 1 != n_execs) {
> > > > > -			xe_wait_ufence(fd, &data[i].exec_sync,
> > > > USER_FENCE_VALUE,
> > > > > -				       exec_queues[e], fence_timeout);
> > > > > -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > > > > -					   addr, bo_size, NULL, 0);
> > > > > -
> > > > > -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > > -			addr += bo_size;
> > > > > -			if (bo)
> > > > > -				xe_vm_bind_async(fd, vm,
> > > > bind_exec_queues[e], bo,
> > > > > -						 0, addr, bo_size, sync, 1);
> > > > > -			else
> > > > > -				xe_vm_bind_userptr_async(fd, vm,
> > > > > -							 bind_exec_queues[e],
> > > > > -							 to_user_pointer(data),
> > > > > -							 addr, bo_size, sync,
> > > > > -							 1);
> > > > > -			xe_wait_ufence(fd, &data[0].vm_sync,
> > > > USER_FENCE_VALUE,
> > > > > -				       bind_exec_queues[e], fence_timeout);
> > > > > -			data[0].vm_sync = 0;
> > > > > -		}
> > > > > -
> > > > > -		if (flags & INVALIDATE && i + 1 != n_execs) {
> > > > > -			if (!(flags & RACE)) {
> > > > > -				/*
> > > > > -				 * Wait for exec completion and check data as
> > > > > -				 * userptr will likely change to different
> > > > > -				 * physical memory on next mmap call triggering
> > > > > -				 * an invalidate.
> > > > > -				 */
> > > > > -				xe_wait_ufence(fd, &data[i].exec_sync,
> > > > > -					       USER_FENCE_VALUE,
> > > > exec_queues[e],
> > > > > -					       fence_timeout);
> > > > > -				igt_assert_eq(data[i].data, 0xc0ffee);
> > > > > -			} else if (i * 2 != n_execs) {
> > > > > -				/*
> > > > > -				 * We issue 1 mmap which races against running
> > > > > -				 * jobs. No real check here aside from this test
> > > > > -				 * not faulting on the GPU.
> > > > > -				 */
> > > > > -				continue;
> > > > > -			}
> > > > > -
> > > > > -			if (flags & RACE) {
> > > > > -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > > > > -					      0x666);
> > > > > -				write(map_fd, data, bo_size);
> > > > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > > -					    PROT_READ | PROT_WRITE,
> > > > MAP_SHARED |
> > > > > -					    MAP_FIXED, map_fd, 0);
> > > > > -			} else {
> > > > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > > -					    PROT_READ | PROT_WRITE,
> > > > MAP_SHARED |
> > > > > -					    MAP_FIXED | MAP_ANONYMOUS, -1,
> > > > 0);
> > > > > -			}
> > > > > -			igt_assert(data != MAP_FAILED);
> > > > > -		}
> > > > > -	}
> > > > > +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> > > > > +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd,
> > > > flags);
> > > > >
> > > > >  	j = flags & INVALIDATE ? n_execs - 1 : 0;
> > > > >  	for (i = j; i < n_execs; i++)
> > > > > @@ -264,20 +132,12 @@ test_exec(int fd, struct
> > > > drm_xe_engine_class_instance *eci,
> > > > >  	if (flags & INVALIDATE)
> > > > >  		usleep(250000);
> > > > >
> > > > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > > -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> > > > > -			   sync, 1);
> > > > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > > -		       bind_exec_queues[0], fence_timeout);
> > > > > +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> > > > >
> > > > >  	for (i = j; i < n_execs; i++)
> > > > >  		igt_assert_eq(data[i].data, 0xc0ffee);
> > > > >
> > > > > -	for (i = 0; i < n_exec_queues; i++) {
> > > > > -		xe_exec_queue_destroy(fd, exec_queues[i]);
> > > > > -		if (bind_exec_queues[i])
> > > > > -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > > > > -	}
> > > > > +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> > > > n_exec_queues);
> > > > >
> > > > >  	if (bo) {
> > > > >  		munmap(data, bo_size);
> > > > > @@ -492,14 +352,14 @@ igt_main
> > > > >  		{ "userptr-rebind", USERPTR | REBIND },
> > > > >  		{ "userptr-invalidate", USERPTR | INVALIDATE },
> > > > >  		{ "userptr-invalidate-race", USERPTR | INVALIDATE | RACE },
> > > > > -		{ "bindexecqueue", BIND_EXECQUEUE },
> > > > > -		{ "bindexecqueue-userptr", BIND_EXECQUEUE | USERPTR },
> > > > > -		{ "bindexecqueue-rebind",  BIND_EXECQUEUE | REBIND },
> > > > > -		{ "bindexecqueue-userptr-rebind",  BIND_EXECQUEUE |
> > > > USERPTR |
> > > > > +		{ "bindexecqueue", BIND_EXEC_QUEUE },
> > > > > +		{ "bindexecqueue-userptr", BIND_EXEC_QUEUE | USERPTR },
> > > > > +		{ "bindexecqueue-rebind",  BIND_EXEC_QUEUE | REBIND },
> > > > > +		{ "bindexecqueue-userptr-rebind",  BIND_EXEC_QUEUE |
> > > > USERPTR |
> > > > >  			REBIND },
> > > > > -		{ "bindexecqueue-userptr-invalidate",  BIND_EXECQUEUE |
> > > > USERPTR |
> > > > > +		{ "bindexecqueue-userptr-invalidate",  BIND_EXEC_QUEUE |
> > > > USERPTR |
> > > > >  			INVALIDATE },
> > > > > -		{ "bindexecqueue-userptr-invalidate-race", BIND_EXECQUEUE |
> > > > USERPTR |
> > > > > +		{ "bindexecqueue-userptr-invalidate-race", BIND_EXEC_QUEUE |
> > > > USERPTR |
> > > > >  			INVALIDATE | RACE },
> > > > >  		{ NULL },
> > > > >  	};
> > > > > diff --git a/tests/intel/xe_exec_fault_mode.c
> > > > b/tests/intel/xe_exec_fault_mode.c
> > > > > index 40fe1743e..7b2fef224 100644
> > > > > --- a/tests/intel/xe_exec_fault_mode.c
> > > > > +++ b/tests/intel/xe_exec_fault_mode.c
> > > > > @@ -16,24 +16,13 @@
> > > > >  #include "igt.h"
> > > > >  #include "lib/igt_syncobj.h"
> > > > >  #include "lib/intel_reg.h"
> > > > > +#include "lib/xe/xe_util.h"
> > > > >  #include "xe_drm.h"
> > > > >
> > > > >  #include "xe/xe_ioctl.h"
> > > > >  #include "xe/xe_query.h"
> > > > >  #include <string.h>
> > > > >
> > > > > -#define MAX_N_EXEC_QUEUES	16
> > > > > -
> > > > > -#define USERPTR		(0x1 << 0)
> > > > > -#define REBIND		(0x1 << 1)
> > > > > -#define INVALIDATE	(0x1 << 2)
> > > > > -#define RACE		(0x1 << 3)
> > > > > -#define BIND_EXEC_QUEUE	(0x1 << 4)
> > > > > -#define PREFETCH	(0x1 << 5)
> > > > > -#define INVALID_FAULT	(0x1 << 6)
> > > > > -#define INVALID_VA	(0x1 << 7)
> > > > > -#define ENABLE_SCRATCH  (0x1 << 8)
> > > > > -
> > > > >  /**
> > > > >   * SUBTEST: invalid-va
> > > > >   * Description: Access invalid va and check for EIO through user fence.
> > > > > @@ -99,7 +88,6 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> > > > *eci,
> > > > >  {
> > > > >  	uint32_t vm;
> > > > >  	uint64_t addr = 0x1a0000;
> > > > > -#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > > > >  	struct drm_xe_sync sync[1] = {
> > > > >  		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags =
> > > > DRM_XE_SYNC_FLAG_SIGNAL,
> > > > >  	          .timeline_value = USER_FENCE_VALUE },
> > > > > @@ -113,14 +101,9 @@ test_exec(int fd, struct
> > drm_xe_engine_class_instance
> > > > *eci,
> > > > >  	uint32_t bind_exec_queues[MAX_N_EXEC_QUEUES];
> > > > >  	size_t bo_size;
> > > > >  	uint32_t bo = 0;
> > > > > -	struct {
> > > > > -		uint32_t batch[16];
> > > > > -		uint64_t pad;
> > > > > -		uint64_t vm_sync;
> > > > > -		uint64_t exec_sync;
> > > > > -		uint32_t data;
> > > > > -	} *data;
> > > > > -	int i, j, b;
> > > > > +	struct cpu_va *data;
> > > > > +	uint64_t placement;
> > > > > +	int i, j;
> > > > >  	int map_fd = -1;
> > > > >
> > > > >  	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> > > > > @@ -134,144 +117,19 @@ test_exec(int fd, struct
> > > > drm_xe_engine_class_instance *eci,
> > > > >  	bo_size = sizeof(*data) * n_execs;
> > > > >  	bo_size = xe_bb_size(fd, bo_size);
> > > > >
> > > > > -	if (flags & USERPTR) {
> > > > > -#define	MAP_ADDRESS	0x00007fadeadbe000
> > > > > -		if (flags & INVALIDATE) {
> > > > > -			data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > PROT_READ |
> > > > > -				    PROT_WRITE, MAP_SHARED | MAP_FIXED |
> > > > > -				    MAP_ANONYMOUS, -1, 0);
> > > > > -			igt_assert(data != MAP_FAILED);
> > > > > -		} else {
> > > > > -			data = aligned_alloc(xe_get_default_alignment(fd),
> > > > > -					     bo_size);
> > > > > -			igt_assert(data);
> > > > > -		}
> > > > > -	} else {
> > > > > -		if (flags & PREFETCH)
> > > > > -			bo = xe_bo_create(fd, 0, bo_size,
> > > > > -					  all_memory_regions(fd) |
> > > > > -					  vram_if_possible(fd, 0),
> > > > > -
> > > > DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > > > -		else
> > > > > -			bo = xe_bo_create(fd, 0, bo_size,
> > > > > -					  vram_if_possible(fd, eci->gt_id),
> > > > > -
> > > > DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > > > > -		data = xe_bo_map(fd, bo, bo_size);
> > > > > -	}
> > > > > -	memset(data, 0, bo_size);
> > > > > -
> > > > > -	for (i = 0; i < n_exec_queues; i++) {
> > > > > -		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > > > > -		if (flags & BIND_EXEC_QUEUE)
> > > > > -			bind_exec_queues[i] =
> > > > > -				xe_bind_exec_queue_create(fd, vm, 0);
> > > > > -		else
> > > > > -			bind_exec_queues[i] = 0;
> > > > > -	};
> > > > > -
> > > > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > > -	if (bo)
> > > > > -		xe_vm_bind_async(fd, vm, bind_exec_queues[0], bo, 0, addr,
> > > > bo_size, sync, 1);
> > > > > -	else
> > > > > -		xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
> > > > > -					 to_user_pointer(data), addr,
> > > > > -					 bo_size, sync, 1);
> > > > > -#define ONE_SEC	MS_TO_NS(1000)
> > > > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > > -		       bind_exec_queues[0], ONE_SEC);
> > > > > -	data[0].vm_sync = 0;
> > > > > -
> > > > > -	if (flags & PREFETCH) {
> > > > > -		/* Should move to system memory */
> > > > > -		xe_vm_prefetch_async(fd, vm, bind_exec_queues[0], 0, addr,
> > > > > -				     bo_size, sync, 1, 0);
> > > > > -		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > > -			       bind_exec_queues[0], ONE_SEC);
> > > > > -		data[0].vm_sync = 0;
> > > > > -	}
> > > > > -
> > > > > -	for (i = 0; i < n_execs; i++) {
> > > > > -		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > > > > -		uint64_t batch_addr = addr + batch_offset;
> > > > > -		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > > > > -		uint64_t sdi_addr = addr + sdi_offset;
> > > > > -		int e = i % n_exec_queues;
> > > > > +	placement = (flags & PREFETCH) ? all_memory_regions(fd) |
> > > > > +		vram_if_possible(fd, 0) : vram_if_possible(fd, eci->gt_id);
> > > > >
> > > > > -		b = 0;
> > > > > -		if (flags & INVALID_VA)
> > > > > -			sdi_addr = 0x1fffffffffff000;
> > > > > +	data = create_xe_bo(fd, vm, &bo, bo_size, placement, flags);
> > > > >
> > > > > -		data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> > > > > -		data[i].batch[b++] = sdi_addr;
> > > > > -		data[i].batch[b++] = sdi_addr >> 32;
> > > > > -		data[i].batch[b++] = 0xc0ffee;
> > > > > -		data[i].batch[b++] = MI_BATCH_BUFFER_END;
> > > > > -		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > > > > +	create_exec_queue(fd, vm, exec_queues, bind_exec_queues,
> > > > > +			n_exec_queues, eci, flags);
> > > > >
> > > > > -		sync[0].addr = addr + (char *)&data[i].exec_sync - (char *)data;
> > > > > +	xe_bo_bind(fd, vm, bind_exec_queues[0], bo, bo_size, addr, sync, data,
> > > > flags);
> > > > >
> > > > > -		exec.exec_queue_id = exec_queues[e];
> > > > > -		exec.address = batch_addr;
> > > > > -		xe_exec(fd, &exec);
> > > > > +	xe_execbuf(fd, vm, &exec, n_execs, exec_queues, bind_exec_queues,
> > > > > +			n_exec_queues, bo, bo_size, sync, addr, data, &map_fd,
> > > > flags);
> > > > >
> > > > > -		if (flags & REBIND && i + 1 != n_execs) {
> > > > > -			xe_wait_ufence(fd, &data[i].exec_sync,
> > > > USER_FENCE_VALUE,
> > > > > -				       exec_queues[e], ONE_SEC);
> > > > > -			xe_vm_unbind_async(fd, vm, bind_exec_queues[e], 0,
> > > > > -					   addr, bo_size, NULL, 0);
> > > > > -
> > > > > -			sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > > -			addr += bo_size;
> > > > > -			if (bo)
> > > > > -				xe_vm_bind_async(fd, vm,
> > > > bind_exec_queues[e], bo,
> > > > > -						 0, addr, bo_size, sync, 1);
> > > > > -			else
> > > > > -				xe_vm_bind_userptr_async(fd, vm,
> > > > > -							 bind_exec_queues[e],
> > > > > -							 to_user_pointer(data),
> > > > > -							 addr, bo_size, sync,
> > > > > -							 1);
> > > > > -			xe_wait_ufence(fd, &data[0].vm_sync,
> > > > USER_FENCE_VALUE,
> > > > > -				       bind_exec_queues[e], ONE_SEC);
> > > > > -			data[0].vm_sync = 0;
> > > > > -		}
> > > > > -
> > > > > -		if (flags & INVALIDATE && i + 1 != n_execs) {
> > > > > -			if (!(flags & RACE)) {
> > > > > -				/*
> > > > > -				 * Wait for exec completion and check data as
> > > > > -				 * userptr will likely change to different
> > > > > -				 * physical memory on next mmap call triggering
> > > > > -				 * an invalidate.
> > > > > -				 */
> > > > > -				xe_wait_ufence(fd, &data[i].exec_sync,
> > > > > -					       USER_FENCE_VALUE,
> > > > exec_queues[e],
> > > > > -					       ONE_SEC);
> > > > > -				igt_assert_eq(data[i].data, 0xc0ffee);
> > > > > -			} else if (i * 2 != n_execs) {
> > > > > -				/*
> > > > > -				 * We issue 1 mmap which races against running
> > > > > -				 * jobs. No real check here aside from this test
> > > > > -				 * not faulting on the GPU.
> > > > > -				 */
> > > > > -				continue;
> > > > > -			}
> > > > > -
> > > > > -			if (flags & RACE) {
> > > > > -				map_fd = open("/tmp", O_TMPFILE | O_RDWR,
> > > > > -					      0x666);
> > > > > -				write(map_fd, data, bo_size);
> > > > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > > -					    PROT_READ | PROT_WRITE,
> > > > MAP_SHARED |
> > > > > -					    MAP_FIXED, map_fd, 0);
> > > > > -			} else {
> > > > > -				data = mmap((void *)MAP_ADDRESS, bo_size,
> > > > > -					    PROT_READ | PROT_WRITE,
> > > > MAP_SHARED |
> > > > > -					    MAP_FIXED | MAP_ANONYMOUS, -1,
> > > > 0);
> > > > > -			}
> > > > > -			igt_assert(data != MAP_FAILED);
> > > > > -		}
> > > > > -	}
> > > > >  	if (!(flags & INVALID_FAULT)) {
> > > > >  		int64_t timeout = ONE_SEC;
> > > > >
> > > > > @@ -286,22 +144,15 @@ test_exec(int fd, struct
> > > > drm_xe_engine_class_instance *eci,
> > > > >  							       exec_queues[i %
> > > > n_exec_queues], &timeout), 0);
> > > > >  		}
> > > > >  	}
> > > > > -	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > > > > -	xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
> > > > > -			   sync, 1);
> > > > > -	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
> > > > > -		       bind_exec_queues[0], ONE_SEC);
> > > > > +
> > > > > +	xe_vm_unbind(fd, vm, bind_exec_queues[0], sync, data, addr, bo_size);
> > > > >
> > > > >  	if (!(flags & INVALID_FAULT) && !(flags & INVALID_VA)) {
> > > > >  		for (i = j; i < n_execs; i++)
> > > > >  				igt_assert_eq(data[i].data, 0xc0ffee);
> > > > >  	}
> > > > >
> > > > > -	for (i = 0; i < n_exec_queues; i++) {
> > > > > -		xe_exec_queue_destroy(fd, exec_queues[i]);
> > > > > -		if (bind_exec_queues[i])
> > > > > -			xe_exec_queue_destroy(fd, bind_exec_queues[i]);
> > > > > -	}
> > > > > +	destory_exec_queue(fd, exec_queues, bind_exec_queues,
> > > > n_exec_queues);
> > > > >
> > > > >  	if (bo) {
> > > > >  		munmap(data, bo_size);
> > > > > --
> > > > > 2.25.1
> > > > >

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2024-04-01 22:31 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-03-31 18:59 [PATCH] lib/xe/xe_util: Creating the helper functions Bommu Krishnaiah
2024-03-31 19:04 ` Bommu, Krishnaiah
2024-03-31 19:05 ` ✗ CI.Patch_applied: failure for " Patchwork
2024-03-31 21:05 ` [PATCH] " Matthew Brost
2024-04-01 16:10   ` Zeng, Oak
2024-04-01 18:40     ` Matthew Brost
2024-04-01 20:28       ` Zeng, Oak
2024-04-01 22:32         ` Matthew Brost
2024-04-01 17:39 ` Zeng, Oak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox