Igt-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 0/2] Add system_allocator test
@ 2025-04-24 20:44 Matthew Brost
  2025-04-24 20:44 ` [PATCH v3 1/2] uapi/xe: Sync latest uAPI KMD headers Matthew Brost
  2025-04-24 20:44 ` [PATCH v3 2/2] tests/xe: Add system_allocator test Matthew Brost
  0 siblings, 2 replies; 8+ messages in thread
From: Matthew Brost @ 2025-04-24 20:44 UTC (permalink / raw)
  To: igt-dev; +Cc: francois.dugast

Matthew Brost (2):
  uapi/xe: Sync latest uAPI KMD headers
  tests/xe: Add system_allocator test

 include/drm-uapi/xe_drm.h              |   49 +-
 lib/xe/xe_ioctl.c                      |   12 +
 lib/xe/xe_ioctl.h                      |    1 +
 tests/intel/xe_exec_system_allocator.c | 1849 ++++++++++++++++++++++++
 tests/meson.build                      |    1 +
 5 files changed, 1909 insertions(+), 3 deletions(-)
 create mode 100644 tests/intel/xe_exec_system_allocator.c

-- 
2.34.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v3 1/2] uapi/xe: Sync latest uAPI KMD headers
  2025-04-24 20:44 [PATCH v3 0/2] Add system_allocator test Matthew Brost
@ 2025-04-24 20:44 ` Matthew Brost
  2025-04-24 22:32   ` Cavitt, Jonathan
  2025-04-25  6:54   ` Francois Dugast
  2025-04-24 20:44 ` [PATCH v3 2/2] tests/xe: Add system_allocator test Matthew Brost
  1 sibling, 2 replies; 8+ messages in thread
From: Matthew Brost @ 2025-04-24 20:44 UTC (permalink / raw)
  To: igt-dev; +Cc: francois.dugast

Pull in latest uAPI KMD headers to enable testing of new features.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 include/drm-uapi/xe_drm.h | 49 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 46 insertions(+), 3 deletions(-)

diff --git a/include/drm-uapi/xe_drm.h b/include/drm-uapi/xe_drm.h
index 154f947ef0..c90fab1b00 100644
--- a/include/drm-uapi/xe_drm.h
+++ b/include/drm-uapi/xe_drm.h
@@ -393,6 +393,10 @@ struct drm_xe_query_mem_regions {
  *
  *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM - Flag is set if the device
  *      has usable VRAM
+ *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY - Flag is set if the device
+ *      has low latency hint support
+ *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR - Flag is set if the
+ *      device has CPU address mirroring support
  *  - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment
  *    required by this device, typically SZ_4K or SZ_64K
  *  - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address
@@ -409,6 +413,8 @@ struct drm_xe_query_config {
 #define DRM_XE_QUERY_CONFIG_REV_AND_DEVICE_ID	0
 #define DRM_XE_QUERY_CONFIG_FLAGS			1
 	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM	(1 << 0)
+	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY	(1 << 1)
+	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR	(1 << 2)
 #define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT		2
 #define DRM_XE_QUERY_CONFIG_VA_BITS			3
 #define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY	4
@@ -911,7 +917,11 @@ struct drm_xe_gem_mmap_offset {
  * struct drm_xe_vm_create - Input of &DRM_IOCTL_XE_VM_CREATE
  *
  * The @flags can be:
- *  - %DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE
+ *  - %DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE - Map the whole virtual address
+ *    space of the VM to scratch page. A vm_bind would overwrite the scratch
+ *    page mapping. This flag is mutually exclusive with the
+ *    %DRM_XE_VM_CREATE_FLAG_FAULT_MODE flag, with an exception of on x2 and
+ *    xe3 platform.
  *  - %DRM_XE_VM_CREATE_FLAG_LR_MODE - An LR, or Long Running VM accepts
  *    exec submissions to its exec_queues that don't have an upper time
  *    limit on the job execution time. But exec submissions to these
@@ -987,6 +997,12 @@ struct drm_xe_vm_destroy {
  *  - %DRM_XE_VM_BIND_FLAG_CHECK_PXP - If the object is encrypted via PXP,
  *    reject the binding if the encryption key is no longer valid. This
  *    flag has no effect on BOs that are not marked as using PXP.
+ *  - %DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR - When the CPU address mirror flag is
+ *    set, no mappings are created rather the range is reserved for CPU address
+ *    mirroring which will be populated on GPU page faults or prefetches. Only
+ *    valid on VMs with DRM_XE_VM_CREATE_FLAG_FAULT_MODE set. The CPU address
+ *    mirror flag are only valid for DRM_XE_VM_BIND_OP_MAP operations, the BO
+ *    handle MBZ, and the BO offset MBZ.
  */
 struct drm_xe_vm_bind_op {
 	/** @extensions: Pointer to the first extension struct, if any */
@@ -1039,7 +1055,9 @@ struct drm_xe_vm_bind_op {
 	 * on the @pat_index. For such mappings there is no actual memory being
 	 * mapped (the address in the PTE is invalid), so the various PAT memory
 	 * attributes likely do not apply.  Simply leaving as zero is one
-	 * option (still a valid pat_index).
+	 * option (still a valid pat_index). Same applies to
+	 * DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR bindings as for such mapping
+	 * there is no actual memory being mapped.
 	 */
 	__u16 pat_index;
 
@@ -1055,6 +1073,14 @@ struct drm_xe_vm_bind_op {
 
 		/** @userptr: user pointer to bind on */
 		__u64 userptr;
+
+		/**
+		 * @cpu_addr_mirror_offset: Offset from GPU @addr to create
+		 * CPU address mirror mappings. MBZ with current level of
+		 * support (e.g. 1 to 1 mapping between GPU and CPU mappings
+		 * only supported).
+		 */
+		__s64 cpu_addr_mirror_offset;
 	};
 
 	/**
@@ -1078,6 +1104,7 @@ struct drm_xe_vm_bind_op {
 #define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
 #define DRM_XE_VM_BIND_FLAG_DUMPABLE	(1 << 3)
 #define DRM_XE_VM_BIND_FLAG_CHECK_PXP	(1 << 4)
+#define DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR	(1 << 5)
 	/** @flags: Bind flags */
 	__u32 flags;
 
@@ -1205,6 +1232,21 @@ struct drm_xe_vm_bind {
  *     };
  *     ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
  *
+ *     Allow users to provide a hint to kernel for cases demanding low latency
+ *     profile. Please note it will have impact on power consumption. User can
+ *     indicate low latency hint with flag while creating exec queue as
+ *     mentioned below,
+ *
+ *     struct drm_xe_exec_queue_create exec_queue_create = {
+ *          .flags = DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT,
+ *          .extensions = 0,
+ *          .vm_id = vm,
+ *          .num_bb_per_exec = 1,
+ *          .num_eng_per_bb = 1,
+ *          .instances = to_user_pointer(&instance),
+ *     };
+ *     ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
+ *
  */
 struct drm_xe_exec_queue_create {
 #define DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY		0
@@ -1223,7 +1265,8 @@ struct drm_xe_exec_queue_create {
 	/** @vm_id: VM to use for this exec queue */
 	__u32 vm_id;
 
-	/** @flags: MBZ */
+#define DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT	(1 << 0)
+	/** @flags: flags to use for this exec queue */
 	__u32 flags;
 
 	/** @exec_queue_id: Returned exec queue ID */
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v3 2/2] tests/xe: Add system_allocator test
  2025-04-24 20:44 [PATCH v3 0/2] Add system_allocator test Matthew Brost
  2025-04-24 20:44 ` [PATCH v3 1/2] uapi/xe: Sync latest uAPI KMD headers Matthew Brost
@ 2025-04-24 20:44 ` Matthew Brost
  2025-04-24 22:32   ` Cavitt, Jonathan
  2025-04-25  7:06   ` Francois Dugast
  1 sibling, 2 replies; 8+ messages in thread
From: Matthew Brost @ 2025-04-24 20:44 UTC (permalink / raw)
  To: igt-dev; +Cc: francois.dugast

Test various uses of system allocator in single thread, multiple
threads, and multiple processes.

Features tested:
 - Malloc with various sizes
 - Mmap with various sizes and flags including file backed mappings
 - Mixing BO allocations with system allocator
 - Various page sizes
 - Dynamically freeing / unmapping memory
 - Sharing VM across threads
 - Faults racing on different hardware engines / GTs / Tiles
 - GPU faults and CPU faults racing
 - CPU faults on multiple threads racing
 - CPU faults on multiple processes racing
 - GPU faults of memory not faulted in by CPU
 - Partial unmap of allocations
 - Attempting to unmap system allocations when GPU has mappings
 - Eviction of both system allocations and BOs
 - Forking child processes and reading data from VRAM
 - mremap data in VRAM
 - Protection changes
 - Multiple faults per execbuf

Running on LNL, BMG, PVC 1 tile, and PVC 2 tile.

v2:
 - Rebase
 - Fix memory allocation to not interfere with malloc (Thomas)
v3:
 - Fix memory leak (Francois)
 - Break out uAPI into own patch (Francois)
 - Use mkstemp for sync file (Francois)
 - Use mkstemp for file backed data (Francois)
 - Drop i argument from READ_VALUE (Francois)
 - Fix test description (Francois)
 - Add comment to check_all_pages_process (Francois)
 - Prefer igt_info over printf (Francois)
 - Fix types in messages (Francois)
 - Prefer odd macro (Francois)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 lib/xe/xe_ioctl.c                      |   12 +
 lib/xe/xe_ioctl.h                      |    1 +
 tests/intel/xe_exec_system_allocator.c | 1849 ++++++++++++++++++++++++
 tests/meson.build                      |    1 +
 4 files changed, 1863 insertions(+)
 create mode 100644 tests/intel/xe_exec_system_allocator.c

diff --git a/lib/xe/xe_ioctl.c b/lib/xe/xe_ioctl.c
index fb8c4aef13..785fc9184c 100644
--- a/lib/xe/xe_ioctl.c
+++ b/lib/xe/xe_ioctl.c
@@ -440,6 +440,18 @@ void *xe_bo_map(int fd, uint32_t bo, size_t size)
 	return __xe_bo_map(fd, bo, size, PROT_WRITE);
 }
 
+void *xe_bo_map_fixed(int fd, uint32_t bo, size_t size, uint64_t addr)
+{
+	uint64_t mmo;	/* mmap offset returned by xe_bo_mmap_offset() for @bo */
+	void *map;
+
+	mmo = xe_bo_mmap_offset(fd, bo);
+	map = mmap((void *)(uintptr_t)addr, size, PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, mmo);
+	igt_assert(map != MAP_FAILED);	/* MAP_FIXED: replaces any mapping at @addr; uintptr_t keeps the cast width-safe */
+
+	return map;
+}
+
 void *xe_bo_mmap_ext(int fd, uint32_t bo, size_t size, int prot)
 {
 	return __xe_bo_map(fd, bo, size, prot);
diff --git a/lib/xe/xe_ioctl.h b/lib/xe/xe_ioctl.h
index 9bdf73b2bd..554a33c9cd 100644
--- a/lib/xe/xe_ioctl.h
+++ b/lib/xe/xe_ioctl.h
@@ -86,6 +86,7 @@ uint32_t xe_exec_queue_create_class(int fd, uint32_t vm, uint16_t class);
 void xe_exec_queue_destroy(int fd, uint32_t exec_queue);
 uint64_t xe_bo_mmap_offset(int fd, uint32_t bo);
 void *xe_bo_map(int fd, uint32_t bo, size_t size);
+void *xe_bo_map_fixed(int fd, uint32_t bo, size_t size, uint64_t addr);
 void *xe_bo_mmap_ext(int fd, uint32_t bo, size_t size, int prot);
 int __xe_exec(int fd, struct drm_xe_exec *exec);
 void xe_exec(int fd, struct drm_xe_exec *exec);
diff --git a/tests/intel/xe_exec_system_allocator.c b/tests/intel/xe_exec_system_allocator.c
new file mode 100644
index 0000000000..4839090cb2
--- /dev/null
+++ b/tests/intel/xe_exec_system_allocator.c
@@ -0,0 +1,1849 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+/**
+ * TEST: Basic tests for execbuf functionality using system allocator
+ * Category: Core
+ * Mega feature: USM
+ * Sub-category: System allocator
+ * Functionality: fault mode, system allocator
+ * GPU: LNL, BMG, PVC
+ */
+
+#include <fcntl.h>
+#include <linux/mman.h>
+#include <time.h>
+
+#include "igt.h"
+#include "lib/igt_syncobj.h"
+#include "lib/intel_reg.h"
+#include "xe_drm.h"
+
+#include "xe/xe_ioctl.h"
+#include "xe/xe_query.h"
+#include <string.h>
+
+#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
+#define QUARTER_SEC		(NSEC_PER_SEC / 4)
+#define FIVE_SEC		(5LL * NSEC_PER_SEC)
+
+struct batch_data {
+	uint32_t batch[16];	/* batch commands, filled in by write_dword() */
+	uint64_t pad;		/* NOTE(review): presumably spacing between batch and data - confirm */
+	uint32_t data;		/* dword the batch's store command targets */
+	uint32_t expected_data;	/* value picked by WRITE_VALUE(), later compared against data */
+};
+
+#define WRITE_VALUE(data__, i__)	({			\
+	if (!(data__)->expected_data)				\
+		(data__)->expected_data = (uint32_t)rand() << 12 | (i__);	\
+	(data__)->expected_data;				\
+}) /* value is picked once per slot; shifting as uint32_t avoids signed-overflow UB */
+#define READ_VALUE(data__)	((data__)->expected_data)
+
+static void __write_dword(uint32_t *batch, uint64_t sdi_addr, uint32_t wdata,
+			int *idx)
+{
+	batch[(*idx)++] = MI_STORE_DWORD_IMM_GEN4;	/* command dword */
+	batch[(*idx)++] = sdi_addr;			/* implicit truncation: low 32 bits of the address */
+	batch[(*idx)++] = sdi_addr >> 32;		/* high 32 bits of the address */
+	batch[(*idx)++] = wdata;			/* immediate value; *idx has advanced by four */
+}
+
+static void write_dword(uint32_t *batch, uint64_t sdi_addr, uint32_t wdata,
+			int *idx)
+{
+	__write_dword(batch, sdi_addr, wdata, idx);	/* emits the four store dwords at *idx */
+	batch[(*idx)++] = MI_BATCH_BUFFER_END;		/* terminate the batch right after the single store */
+}
+
+static void check_all_pages(void *ptr, uint64_t alloc_size, uint64_t stride,
+			    pthread_barrier_t *barrier)
+{
+	int i, n_writes = alloc_size / stride;	/* one batch_data slot every @stride bytes */
+
+	for (i = 0; i < n_writes; ++i) {
+		struct batch_data *data = ptr + i * stride;
+
+		igt_assert_eq(data->data, READ_VALUE(data));	/* stored dword must equal the expected value */
+
+		if (barrier)	/* optional: rendezvous with the other checkers after every slot */
+			pthread_barrier_wait(barrier);
+	}
+}
+
+static char sync_file[] = "/tmp/xe_exec_system_allocator_syncXXXXXX";
+static int sync_fd;
+
+static void open_sync_file(void)
+{
+	sync_fd = mkstemp(sync_file);	/* rewrites the XXXXXX suffix of sync_file in place; result is not checked here */
+}
+
+static void close_sync_file(void)
+{
+	close(sync_fd);	/* only closes the descriptor; the file at sync_file is not unlinked here */
+}
+
+struct process_data {
+	pthread_mutex_t mutex;		/* protects @go */
+	pthread_cond_t cond;		/* broadcast by signal_pdata() once @go is set */
+	pthread_barrier_t barrier;	/* participant count is given to init_pdata() */
+	bool go;			/* start flag that wait_pdata() blocks on */
+};
+
+static void wait_pdata(struct process_data *pdata)
+{
+	pthread_mutex_lock(&pdata->mutex);
+	while (!pdata->go)	/* loop guards against spurious wakeups */
+		pthread_cond_wait(&pdata->cond, &pdata->mutex);
+	pthread_mutex_unlock(&pdata->mutex);	/* returns only after signal_pdata() set go */
+}
+
+static void init_pdata(struct process_data *pdata, int n_engine)
+{
+	pthread_mutexattr_t mutex_attr;
+	pthread_condattr_t cond_attr;
+	pthread_barrierattr_t barrier_attr;
+
+	pthread_mutexattr_init(&mutex_attr);
+	pthread_mutexattr_setpshared(&mutex_attr, PTHREAD_PROCESS_SHARED);	/* usable across processes */
+	pthread_mutex_init(&pdata->mutex, &mutex_attr);
+
+	pthread_condattr_init(&cond_attr);
+	pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED);
+	pthread_cond_init(&pdata->cond, &cond_attr);
+
+	pthread_barrierattr_init(&barrier_attr);
+	pthread_barrierattr_setpshared(&barrier_attr, PTHREAD_PROCESS_SHARED);
+	pthread_barrier_init(&pdata->barrier, &barrier_attr, n_engine);	/* barrier releases after n_engine waiters */
+
+	pdata->go = false;	/* waiters stay blocked until signal_pdata(); the attr objects are not destroyed here */
+}
+
+static void signal_pdata(struct process_data *pdata)
+{
+	pthread_mutex_lock(&pdata->mutex);
+	pdata->go = true;	/* set under the mutex so wait_pdata() cannot miss it */
+	pthread_cond_broadcast(&pdata->cond);	/* wake every waiter, not just one */
+	pthread_mutex_unlock(&pdata->mutex);
+}
+
+/* many_alloc flags */
+#define MIX_BO_ALLOC		(0x1 << 0)
+#define BENCHMARK		(0x1 << 1)
+#define CPU_FAULT_THREADS	(0x1 << 2)
+#define CPU_FAULT_PROCESS	(0x1 << 3)
+#define CPU_FAULT_SAME_PAGE	(0x1 << 4)
+
+static void process_check(void *ptr, uint64_t alloc_size, uint64_t stride,
+			  unsigned int flags)
+{
+	struct process_data *pdata;
+	int map_fd;
+
+	map_fd = open(sync_file, O_RDWR, 0666);	/* mode must be octal (was 0x666); unused without O_CREAT */
+	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
+		     PROT_WRITE, MAP_SHARED, map_fd, 0);
+	wait_pdata(pdata);	/* block until the parent calls signal_pdata() */
+
+	if (flags & CPU_FAULT_SAME_PAGE)	/* all checkers walk the same slots, meeting at the shared barrier */
+		check_all_pages(ptr, alloc_size, stride, &pdata->barrier);
+	else
+		check_all_pages(ptr, alloc_size, stride, NULL);
+
+	close(map_fd);
+	munmap(pdata, sizeof(*pdata));
+}
+
+/*
+ * Partition checking of results in chunks which causes multiple processes to
+ * fault same VRAM allocation in parallel.
+ */
+static void
+check_all_pages_process(void *ptr, uint64_t alloc_size, uint64_t stride,
+			int n_process, unsigned int flags)
+{
+	struct process_data *pdata;
+	int map_fd, i;
+
+	map_fd = open(sync_file, O_RDWR | O_CREAT, 0666);	/* octal mode; 0x666 would set unrelated permission bits */
+	posix_fallocate(map_fd, 0, sizeof(*pdata));	/* make the file large enough to back *pdata */
+	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
+		     PROT_WRITE, MAP_SHARED, map_fd, 0);
+
+	init_pdata(pdata, n_process);	/* barrier sized for all children */
+
+	for (i = 0; i < n_process; ++i) {
+		igt_fork(child, 1)
+			if (flags & CPU_FAULT_SAME_PAGE)
+				process_check(ptr, alloc_size, stride, flags);
+			else	/* child i checks slots i, i + n_process, ... */
+				process_check(ptr + stride * i, alloc_size,
+					      stride * n_process, flags);
+	}
+
+	signal_pdata(pdata);	/* release all children at once */
+	igt_waitchildren();
+
+	close(map_fd);
+	munmap(pdata, sizeof(*pdata));
+}
+
+struct thread_check_data {
+	pthread_t thread;		/* handle joined by check_all_pages_threads() */
+	pthread_mutex_t *mutex;		/* protects *go */
+	pthread_cond_t *cond;		/* waited on until *go becomes true */
+	pthread_barrier_t *barrier;	/* NULL unless CPU_FAULT_SAME_PAGE was requested */
+	void *ptr;			/* first slot this thread checks */
+	uint64_t alloc_size;		/* forwarded to check_all_pages() */
+	uint64_t stride;		/* distance between the slots this thread checks */
+	bool *go;			/* shared start flag owned by the spawning function */
+};
+
+static void *thread_check(void *data)
+{
+	struct thread_check_data *t = data;
+
+	pthread_mutex_lock(t->mutex);
+	while (!*t->go)	/* hold until the spawner flips the shared start flag */
+		pthread_cond_wait(t->cond, t->mutex);
+	pthread_mutex_unlock(t->mutex);
+
+	check_all_pages(t->ptr, t->alloc_size, t->stride, t->barrier);	/* asserts on the first mismatch */
+
+	return NULL;
+}
+
+/*
+ * Partition checking of results in chunks which causes multiple threads to
+ * fault same VRAM allocation in parallel.
+ */
+static void
+check_all_pages_threads(void *ptr, uint64_t alloc_size, uint64_t stride,
+			int n_threads, unsigned int flags)
+{
+	struct thread_check_data *threads_check_data;
+	pthread_mutex_t mutex;
+	pthread_cond_t cond;
+	pthread_barrier_t barrier;
+	int i;
+	bool go = false;	/* start flag shared with every thread through a pointer */
+
+	threads_check_data = calloc(n_threads, sizeof(*threads_check_data));
+	igt_assert(threads_check_data);
+
+	pthread_mutex_init(&mutex, 0);
+	pthread_cond_init(&cond, 0);
+	pthread_barrier_init(&barrier, 0, n_threads);	/* only handed out in the same-page case */
+
+	for (i = 0; i < n_threads; ++i) {
+		threads_check_data[i].mutex = &mutex;
+		threads_check_data[i].cond = &cond;
+		if (flags & CPU_FAULT_SAME_PAGE) {	/* every thread walks all slots in lockstep */
+			threads_check_data[i].barrier = &barrier;
+			threads_check_data[i].ptr = ptr;
+			threads_check_data[i].alloc_size = alloc_size;
+			threads_check_data[i].stride = stride;
+		} else {	/* thread i checks slots i, i + n_threads, ... */
+			threads_check_data[i].barrier = NULL;
+			threads_check_data[i].ptr = ptr + stride * i;
+			threads_check_data[i].alloc_size = alloc_size;
+			threads_check_data[i].stride = n_threads * stride;
+		}
+		threads_check_data[i].go = &go;
+
+		pthread_create(&threads_check_data[i].thread, 0, thread_check,
+			       &threads_check_data[i]);
+	}
+
+	pthread_mutex_lock(&mutex);
+	go = true;	/* threads created above are parked in thread_check() until now */
+	pthread_cond_broadcast(&cond);
+	pthread_mutex_unlock(&mutex);
+
+	for (i = 0; i < n_threads; ++i)
+		pthread_join(threads_check_data[i].thread, NULL);
+	free(threads_check_data);	/* mutex, cond and barrier are not destroyed here */
+}
+
+static void touch_all_pages(int fd, uint32_t exec_queue, void *ptr,
+			    uint64_t alloc_size, uint64_t stride,
+			    struct timespec *tv, uint64_t *submit)
+{
+	struct drm_xe_sync sync[1] = {
+		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
+		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
+		  .timeline_value = USER_FENCE_VALUE },
+	};
+	struct drm_xe_exec exec = {
+		.num_batch_buffer = 1,
+		.num_syncs = 0,	/* raised to 1 for the final exec only */
+		.exec_queue_id = exec_queue,
+		.syncs = to_user_pointer(&sync),
+	};
+	uint64_t addr = to_user_pointer(ptr);
+	int i, ret, n_writes = alloc_size / stride;	/* one batch_data slot every @stride bytes */
+	u64 *exec_ufence = NULL;
+	int64_t timeout = FIVE_SEC;
+
+	exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
+			   PROT_WRITE, MAP_SHARED |
+			   MAP_ANONYMOUS, -1, 0);
+	igt_assert(exec_ufence != MAP_FAILED);
+	memset(exec_ufence, 0, SZ_4K);
+	sync[0].addr = to_user_pointer(exec_ufence);	/* fence lives in its own page, outside @ptr */
+
+	for (i = 0; i < n_writes; ++i, addr += stride) {	/* pass 1: CPU writes a store-dword batch into every slot */
+		struct batch_data *data = ptr + i * stride;
+		uint64_t sdi_offset = (char *)&data->data - (char *)data;
+		uint64_t sdi_addr = addr + sdi_offset;
+		int b = 0;
+
+		write_dword(data->batch, sdi_addr, WRITE_VALUE(data, i), &b);
+		igt_assert(b <= ARRAY_SIZE(data->batch));
+	}
+
+	igt_nsec_elapsed(tv);	/* NOTE(review): presumably arms the timer when *tv is zero - confirm */
+	*submit = igt_nsec_elapsed(tv);
+
+	addr = to_user_pointer(ptr);
+	for (i = 0; i < n_writes; ++i, addr += stride) {	/* pass 2: submit each batch by its CPU address */
+		struct batch_data *data = ptr + i * stride;
+		uint64_t batch_offset = (char *)&data->batch - (char *)data;
+		uint64_t batch_addr = addr + batch_offset;
+
+		exec.address = batch_addr;
+		if (i + 1 == n_writes)	/* attach the user fence to the last submission */
+			exec.num_syncs = 1;
+		xe_exec(fd, &exec);
+	}
+
+	ret = __xe_wait_ufence(fd, exec_ufence, USER_FENCE_VALUE, exec_queue,
+			       &timeout);
+	if (ret) {	/* dump addresses and values for every slot before failing */
+		igt_info("FAIL EXEC_UFENCE_ADDR: 0x%016llx\n", sync[0].addr);
+		igt_info("FAIL EXEC_UFENCE: EXPECTED=0x%016llx, ACTUAL=0x%016lx\n",
+			 USER_FENCE_VALUE, exec_ufence[0]);
+
+		addr = to_user_pointer(ptr);
+		for (i = 0; i < n_writes; ++i, addr += stride) {
+			struct batch_data *data = ptr + i * stride;
+			uint64_t batch_offset = (char *)&data->batch - (char *)data;
+			uint64_t batch_addr = addr + batch_offset;
+			uint64_t sdi_offset = (char *)&data->data - (char *)data;
+			uint64_t sdi_addr = addr + sdi_offset;
+
+			igt_info("FAIL BATCH_ADDR: 0x%016lx\n", batch_addr);
+			igt_info("FAIL SDI_ADDR: 0x%016lx\n", sdi_addr);
+			igt_info("FAIL SDI_ADDR (in batch): 0x%016lx\n",
+				 (((u64)data->batch[2]) << 32) | data->batch[1]);
+			igt_info("FAIL DATA: EXPECTED=0x%08x, ACTUAL=0x%08x\n",
+				 data->expected_data, data->data);
+		}
+		igt_assert_eq(ret, 0);
+	}
+	munmap(exec_ufence, SZ_4K);
+}
+
+static int va_bits;
+
+#define bind_system_allocator(__sync, __num_sync)			\
+	__xe_vm_bind_assert(fd, vm, 0,					\
+			    0, 0, 0, 0x1ull << va_bits,			\
+			    DRM_XE_VM_BIND_OP_MAP,			\
+			    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,	\
+			    (__sync), (__num_sync), 0, 0)
+
+#define unbind_system_allocator()				\
+	__xe_vm_bind(fd, vm, 0, 0, 0, 0, 0x1ull << va_bits,	\
+		     DRM_XE_VM_BIND_OP_UNMAP, 0,		\
+		     NULL, 0, 0, 0, 0)
+
+#define odd(__i)	((__i) & 1)	/* argument parenthesized so odd(a | b) parses as intended */
+
+struct aligned_alloc_type {
+	void *__ptr;	/* start of the raw mmap() reservation */
+	void *ptr;	/* __ptr rounded up to the requested alignment */
+	size_t __size;	/* length of the raw reservation (size + alignment) */
+	size_t size;	/* usable length requested by the caller */
+};
+
+static struct aligned_alloc_type __aligned_alloc(size_t alignment, size_t size)
+{
+	struct aligned_alloc_type aligned_alloc_type;
+
+	aligned_alloc_type.__ptr = mmap(NULL, alignment + size, PROT_NONE, MAP_PRIVATE |
+			      MAP_ANONYMOUS, -1, 0);	/* over-reserve so an aligned @size window always fits */
+	igt_assert(aligned_alloc_type.__ptr != MAP_FAILED);
+
+	aligned_alloc_type.ptr = (void *)ALIGN((uintptr_t)aligned_alloc_type.__ptr, alignment);	/* uintptr_t: pointer-width integer */
+	aligned_alloc_type.size = size;
+	aligned_alloc_type.__size = size + alignment;
+
+	return aligned_alloc_type;	/* reservation is PROT_NONE; callers map over ptr with MAP_FIXED */
+}
+
+static void __aligned_free(struct aligned_alloc_type  *aligned_alloc_type)
+{
+	munmap(aligned_alloc_type->__ptr, aligned_alloc_type->__size);	/* releases the whole raw reservation, padding included */
+}
+
+static void __aligned_partial_free(struct aligned_alloc_type  *aligned_alloc_type)
+{
+	size_t begin_size = (size_t)(aligned_alloc_type->ptr - aligned_alloc_type->__ptr);	/* padding before the aligned window */
+
+	if (begin_size)
+		munmap(aligned_alloc_type->__ptr, begin_size);
+	if (aligned_alloc_type->__size - aligned_alloc_type->size - begin_size)	/* padding after the aligned window */
+		munmap(aligned_alloc_type->ptr + aligned_alloc_type->size,
+		       aligned_alloc_type->__size - aligned_alloc_type->size - begin_size);
+}
+
+/**
+ * SUBTEST: unaligned-alloc
+ * Description: allocate unaligned sizes of memory
+ * Test category: functionality test
+ *
+ * SUBTEST: fault-benchmark
+ * Description: Benchmark how long GPU / CPU take
+ * Test category: performance test
+ *
+ * SUBTEST: fault-threads-benchmark
+ * Description: Benchmark how long GPU / CPU take, reading results with multiple threads
+ * Test category: performance and functionality test
+ *
+ * SUBTEST: fault-threads-same-page-benchmark
+ * Description: Benchmark how long GPU / CPU take, reading results with multiple threads, hammer same page
+ * Test category: performance and functionality test
+ *
+ * SUBTEST: fault-process-benchmark
+ * Description: Benchmark how long GPU / CPU take, reading results with multiple process
+ * Test category: performance and functionality test
+ *
+ * SUBTEST: fault-process-same-page-benchmark
+ * Description: Benchmark how long GPU / CPU take, reading results with multiple process, hammer same page
+ * Test category: performance and functionality test
+ *
+ * SUBTEST: evict-malloc
+ * Description: trigger eviction of VRAM allocated via malloc
+ * Test category: functionality test
+ *
+ * SUBTEST: evict-malloc-mix-bo
+ * Description: trigger eviction of VRAM allocated via malloc and BO create
+ * Test category: functionality test
+ *
+ * SUBTEST: processes-evict-malloc
+ * Description: multi-process trigger eviction of VRAM allocated via malloc
+ * Test category: stress test
+ *
+ * SUBTEST: processes-evict-malloc-mix-bo
+ * Description: multi-process trigger eviction of VRAM allocated via malloc and BO create
+ * Test category: stress test
+ */
+
+static void
+many_allocs(int fd, struct drm_xe_engine_class_instance *eci,
+	    uint64_t total_alloc, uint64_t alloc_size, uint64_t stride,
+	    pthread_barrier_t *barrier, unsigned int flags)
+{
+	uint32_t vm, exec_queue;
+	int num_allocs = flags & BENCHMARK ? 1 :
+		(9 * (total_alloc / alloc_size)) / 8;	/* 9/8 of what fits in total_alloc */
+	struct aligned_alloc_type *allocs;
+	uint32_t *bos = NULL;
+	struct timespec tv = {};
+	uint64_t submit, read, elapsed;
+	int i;
+
+	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
+			  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
+	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
+
+	bind_system_allocator(NULL, 0);	/* CPU-address-mirror bind over the whole VA range */
+
+	allocs = malloc(sizeof(*allocs) * num_allocs);
+	igt_assert(allocs);
+	memset(allocs, 0, sizeof(*allocs) * num_allocs);
+
+	if (flags & MIX_BO_ALLOC) {
+		bos = malloc(sizeof(*bos) * num_allocs);
+		igt_assert(bos);
+		memset(bos, 0, sizeof(*bos) * num_allocs);
+	}
+
+	for (i = 0; i < num_allocs; ++i) {
+		struct aligned_alloc_type alloc = {};	/* zeroed: only .ptr is set on the non-BO path */
+
+		if (flags & MIX_BO_ALLOC && odd(i)) {	/* odd slots are BO-backed, mapped at a fixed address */
+			uint32_t bo_flags =
+				DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
+
+			alloc = __aligned_alloc(SZ_2M, alloc_size);
+			igt_assert(alloc.ptr);
+
+			bos[i] = xe_bo_create(fd, vm, alloc_size,
+					      vram_if_possible(fd, eci->gt_id),
+					      bo_flags);
+			alloc.ptr = xe_bo_map_fixed(fd, bos[i], alloc_size,
+						    to_user_pointer(alloc.ptr));
+			xe_vm_bind_async(fd, vm, 0, bos[i], 0,
+					 to_user_pointer(alloc.ptr),
+					 alloc_size, 0, 0);
+		} else {	/* plain heap allocation */
+			alloc.ptr = aligned_alloc(SZ_2M, alloc_size);
+			igt_assert(alloc.ptr);
+		}
+		allocs[i] = alloc;
+
+		touch_all_pages(fd, exec_queue, allocs[i].ptr, alloc_size, stride,
+				&tv, &submit);
+	}
+
+	if (barrier)	/* optional rendezvous with the other callers before checking */
+		pthread_barrier_wait(barrier);
+
+	for (i = 0; i < num_allocs; ++i) {
+		if (flags & BENCHMARK)
+			read = igt_nsec_elapsed(&tv);
+#define NUM_CHECK_THREADS	8
+		if (flags & CPU_FAULT_PROCESS)
+			check_all_pages_process(allocs[i].ptr, alloc_size, stride,
+						NUM_CHECK_THREADS, flags);
+		else if (flags & CPU_FAULT_THREADS)
+			check_all_pages_threads(allocs[i].ptr, alloc_size, stride,
+						NUM_CHECK_THREADS, flags);
+		else
+			check_all_pages(allocs[i].ptr, alloc_size, stride, NULL);
+		if (flags & BENCHMARK) {
+			elapsed = igt_nsec_elapsed(&tv);
+			igt_info("Execution took %.3fms (submit %.1fus, read %.1fus, total %.1fus, read_total %.1fus)\n",
+				 1e-6 * elapsed, 1e-3 * submit, 1e-3 * read,
+				 1e-3 * (elapsed - submit),
+				 1e-3 * (elapsed - read));
+		}
+		if (bos && bos[i]) {	/* BO-backed slot: drop the reservation, then the BO */
+			__aligned_free(allocs + i);
+			gem_close(fd, bos[i]);
+		} else {
+			free(allocs[i].ptr);
+		}
+	}
+	if (bos)
+		free(bos);
+	free(allocs);
+	xe_exec_queue_destroy(fd, exec_queue);
+	xe_vm_destroy(fd, vm);
+}
+
+static void process_evict(struct drm_xe_engine_class_instance *hwe,
+			  uint64_t total_alloc, uint64_t alloc_size,
+			  uint64_t stride, unsigned int flags)
+{
+	struct process_data *pdata;
+	int map_fd;
+	int fd;
+
+	map_fd = open(sync_file, O_RDWR, 0666);	/* mode must be octal (was 0x666); unused without O_CREAT */
+	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
+		     PROT_WRITE, MAP_SHARED, map_fd, 0);
+	wait_pdata(pdata);	/* block until the parent calls signal_pdata() */
+
+	fd = drm_open_driver(DRIVER_XE);	/* each child uses its own DRM fd */
+	many_allocs(fd, hwe, total_alloc, alloc_size, stride, &pdata->barrier,
+		    flags);
+	drm_close_driver(fd);
+
+	close(map_fd);
+	munmap(pdata, sizeof(*pdata));
+}
+
+static void
+processes_evict(int fd, uint64_t alloc_size, uint64_t stride,
+		unsigned int flags)
+{
+	struct drm_xe_engine_class_instance *hwe;
+	struct process_data *pdata;
+	int n_engine_gt[2] = { 0, 0 }, n_engine = 0;
+	int map_fd;
+
+	map_fd = open(sync_file, O_RDWR | O_CREAT, 0666);	/* octal mode; 0x666 would set unrelated permission bits */
+	posix_fallocate(map_fd, 0, sizeof(*pdata));	/* make the file large enough to back *pdata */
+	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
+		     PROT_WRITE, MAP_SHARED, map_fd, 0);
+
+	xe_for_each_engine(fd, hwe) {	/* count engines overall and per GT */
+		igt_assert(hwe->gt_id < 2);	/* n_engine_gt[] only has two entries */
+		n_engine_gt[hwe->gt_id]++;
+		n_engine++;
+	}
+
+	init_pdata(pdata, n_engine);	/* barrier sized for one child per engine */
+
+	xe_for_each_engine(fd, hwe) {
+		igt_fork(child, 1)
+			process_evict(hwe,
+				      xe_visible_vram_size(fd, hwe->gt_id) /
+				      n_engine_gt[hwe->gt_id], alloc_size,
+				      stride, flags);	/* each child gets an equal share of its GT's visible VRAM */
+	}
+
+	signal_pdata(pdata);	/* release all children at once */
+	igt_waitchildren();
+
+	close(map_fd);
+	munmap(pdata, sizeof(*pdata));
+}
+
+#define CPU_FAULT	(0x1 << 0)
+#define REMAP		(0x1 << 1)
+#define MIDDLE		(0x1 << 2)
+
+/**
+ * SUBTEST: partial-munmap-cpu-fault
+ * Description: munmap partially with cpu access in between
+ * Test category: functionality test
+ *
+ * SUBTEST: partial-munmap-no-cpu-fault
+ * Description: munmap partially with no cpu access in between
+ * Test category: functionality test
+ *
+ * SUBTEST: partial-remap-cpu-fault
+ * Description: remap partially with cpu access in between
+ * Test category: functionality test
+ *
+ * SUBTEST: partial-remap-no-cpu-fault
+ * Description: remap partially with no cpu access in between
+ * Test category: functionality test
+ *
+ * SUBTEST: partial-middle-munmap-cpu-fault
+ * Description: munmap middle with cpu access in between
+ * Test category: functionality test
+ *
+ * SUBTEST: partial-middle-munmap-no-cpu-fault
+ * Description: munmap middle with no cpu access in between
+ * Test category: functionality test
+ *
+ * SUBTEST: partial-middle-remap-cpu-fault
+ * Description: remap middle with cpu access in between
+ * Test category: functionality test
+ *
+ * SUBTEST: partial-middle-remap-no-cpu-fault
+ * Description: remap middle with no cpu access in between
+ * Test category: functionality test
+ */
+
+static void
+partial(int fd, struct drm_xe_engine_class_instance *eci, unsigned int flags)
+{
+	struct drm_xe_sync sync[1] = {
+		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
+	          .timeline_value = USER_FENCE_VALUE },
+	};
+	struct drm_xe_exec exec = {
+		.num_batch_buffer = 1,
+		.num_syncs = 1,
+		.syncs = to_user_pointer(sync),
+	};
+	struct {
+		uint32_t batch[16];
+		uint64_t pad;
+		uint64_t vm_sync;
+		uint64_t exec_sync;
+		uint32_t data;
+		uint32_t expected_data;
+	} *data;
+	size_t bo_size = SZ_2M, unmap_offset = 0;
+	uint32_t vm, exec_queue;
+	u64 *exec_ufence = NULL;
+	int i;
+	void *old, *new = NULL;
+	struct aligned_alloc_type alloc;
+
+	if (flags & MIDDLE)	/* unmap from a quarter in instead of from the start */
+		unmap_offset = bo_size / 4;
+
+	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
+			  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
+
+	alloc = __aligned_alloc(bo_size, bo_size);
+	igt_assert(alloc.ptr);
+
+	data = mmap(alloc.ptr, bo_size, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);	/* make the aligned window accessible */
+	igt_assert(data != MAP_FAILED);
+	memset(data, 0, bo_size);
+	old = data;
+
+	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
+
+	sync[0].addr = to_user_pointer(&data[0].vm_sync);
+	bind_system_allocator(sync, 1);
+	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0, FIVE_SEC);
+	data[0].vm_sync = 0;
+
+	exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
+			   PROT_WRITE, MAP_SHARED |
+			   MAP_ANONYMOUS, -1, 0);
+	igt_assert(exec_ufence != MAP_FAILED);
+	memset(exec_ufence, 0, SZ_4K);
+
+	for (i = 0; i < 2; i++) {	/* build both batches up front */
+		uint64_t addr = to_user_pointer(data);
+		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
+		uint64_t sdi_addr = addr + sdi_offset;
+		int b = 0;
+
+		write_dword(data[i].batch, sdi_addr, WRITE_VALUE(&data[i], i), &b);
+		igt_assert(b <= ARRAY_SIZE(data[i].batch));
+
+		if (!i)	/* second batch lives past the range unmapped below */
+			data = old + unmap_offset + bo_size / 2;
+	}
+
+	data = old;
+	exec.exec_queue_id = exec_queue;
+
+	for (i = 0; i < 2; i++) {
+		uint64_t addr = to_user_pointer(data);
+		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
+		uint64_t batch_addr = addr + batch_offset;
+
+		sync[0].addr = new ? to_user_pointer(new) :
+			to_user_pointer(exec_ufence);	/* after a remap the fence goes in the new mapping */
+		exec.address = batch_addr;
+		xe_exec(fd, &exec);
+
+		xe_wait_ufence(fd, new ?: exec_ufence, USER_FENCE_VALUE,
+			       exec_queue, FIVE_SEC);
+		if (i || (flags & CPU_FAULT))	/* first result is read by the CPU only when requested */
+			igt_assert_eq(data[i].data, READ_VALUE(&data[i]));
+		exec_ufence[0] = 0;
+
+		if (!i) {	/* between the execs: unmap (and optionally remap) half the window */
+			data = old + unmap_offset + bo_size / 2;
+			munmap(old + unmap_offset, bo_size / 2);
+			if (flags & REMAP) {
+				new = mmap(old + unmap_offset, bo_size / 2,
+					   PROT_READ | PROT_WRITE,
+					   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
+					   MAP_LOCKED, -1, 0);
+				igt_assert(new != MAP_FAILED);
+			}
+		}
+	}
+
+	xe_exec_queue_destroy(fd, exec_queue);
+	munmap(exec_ufence, SZ_4K);
+	if (new)	/* unmap the remap first, while its range is still inside our reservation */
+		munmap(new, bo_size / 2);
+	__aligned_free(&alloc);
+	xe_vm_destroy(fd, vm);
+}
+
+/* Upper bound on n_exec_queues; sizes the exec_queues[] array in test_exec() */
+#define MAX_N_EXEC_QUEUES	16
+
+/*
+ * Behaviour flags for test_exec() / threads(), OR-ed together in
+ * struct section.flags:
+ *
+ * MMAP              - buffer comes from mmap() instead of aligned_alloc()
+ * NEW               - switch to a freshly allocated buffer after every exec
+ * BO_UNMAP          - bind a BO at the buffer address, then rebind the range
+ *                     as CPU address mirror before any exec
+ * FREE              - with NEW, release the old buffer immediately rather
+ *                     than deferring it to the end of the test
+ * BUSY              - expect unbind_system_allocator() to return -EBUSY
+ *                     once the execs are done
+ * BO_MAP            - with NEW (malloc path), odd iterations map a BO at the
+ *                     new address instead of using malloc'ed memory
+ * RACE              - no dedicated exec fence page; the exec fence lives in
+ *                     the data buffer (test_exec_data.exec_sync)
+ * SKIP_MEMSET       - do not zero buffers on the CPU after allocating them
+ * FAULT             - submit a bogus batch address at iteration n_execs / 2
+ * FILE_BACKED       - mmap() is backed by a temporary file
+ * LOCK              - mlock() the buffer (incompatible with NEW)
+ * MMAP_SHARED       - use MAP_SHARED rather than MAP_PRIVATE
+ * HUGE_PAGE         - 2M-align sizes and request MAP_HUGETLB | MAP_HUGE_2MB
+ * SHARED_ALLOC      - threads() hands every thread a slice of one shared
+ *                     allocation; test_exec() returns early if it is still set
+ * FORK_READ         - verify the result from a forked child
+ * FORK_READ_AFTER   - with FORK_READ, parent verifies after the child exited
+ *                     (otherwise it verifies while the child is running)
+ * MREMAP            - mremap() the buffer to a new address after each exec
+ * DONTUNMAP         - with MREMAP, add MREMAP_DONTUNMAP and munmap() the old
+ *                     range explicitly afterwards
+ * READ_ONLY_REMAP   - with MREMAP, make the buffer PROT_READ across the
+ *                     mremap() and restore PROT_READ | PROT_WRITE after
+ * SYNC_EXEC         - threads() passes a barrier so threads exec in lockstep
+ * EVERY_OTHER_CHECK - only verify results on odd iterations (requires MREMAP)
+ * MULTI_FAULT       - each batch stores to N_MULTI_FAULT addresses spaced by
+ *                     the caller's original bo_size
+ */
+#define MMAP		(0x1 << 0)
+#define NEW		(0x1 << 1)
+#define BO_UNMAP	(0x1 << 2)
+#define FREE		(0x1 << 3)
+#define BUSY		(0x1 << 4)
+#define BO_MAP		(0x1 << 5)
+#define RACE		(0x1 << 6)
+#define SKIP_MEMSET	(0x1 << 7)
+#define FAULT		(0x1 << 8)
+#define FILE_BACKED	(0x1 << 9)
+#define LOCK		(0x1 << 10)
+#define MMAP_SHARED	(0x1 << 11)
+#define HUGE_PAGE	(0x1 << 12)
+#define SHARED_ALLOC	(0x1 << 13)
+#define FORK_READ	(0x1 << 14)
+#define FORK_READ_AFTER	(0x1 << 15)
+#define MREMAP		(0x1 << 16)
+#define DONTUNMAP	(0x1 << 17)
+#define READ_ONLY_REMAP	(0x1 << 18)
+#define SYNC_EXEC	(0x1 << 19)
+#define EVERY_OTHER_CHECK	(0x1 << 20)
+#define MULTI_FAULT	(0x1 << 21)
+
+/* Number of stores (and of bo_size-sized chunks) used by MULTI_FAULT */
+#define N_MULTI_FAULT	4
+
+/**
+ * SUBTEST: once-%s
+ * Description: Run %arg[1] system allocator test only once
+ * Test category: functionality test
+ *
+ * SUBTEST: once-large-%s
+ * Description: Run %arg[1] system allocator test only once with large allocation
+ * Test category: functionality test
+ *
+ * SUBTEST: twice-%s
+ * Description: Run %arg[1] system allocator test twice
+ * Test category: functionality test
+ *
+ * SUBTEST: twice-large-%s
+ * Description: Run %arg[1] system allocator test twice with large allocation
+ * Test category: functionality test
+ *
+ * SUBTEST: many-%s
+ * Description: Run %arg[1] system allocator test many times
+ * Test category: stress test
+ *
+ * SUBTEST: many-stride-%s
+ * Description: Run %arg[1] system allocator test many times with a stride on each exec
+ * Test category: stress test
+ *
+ * SUBTEST: many-execqueues-%s
+ * Description: Run %arg[1] system allocator test on many exec_queues
+ * Test category: stress test
+ *
+ * SUBTEST: many-large-%s
+ * Description: Run %arg[1] system allocator test many times with large allocations
+ * Test category: stress test
+ *
+ * SUBTEST: many-large-execqueues-%s
+ * Description: Run %arg[1] system allocator test on many exec_queues with large allocations
+ * Test category: stress test
+ *
+ * SUBTEST: threads-many-%s
+ * Description: Run %arg[1] system allocator threaded test many times
+ * Test category: stress test
+ *
+ * SUBTEST: threads-many-stride-%s
+ * Description: Run %arg[1] system allocator threaded test many times with a stride on each exec
+ * Test category: stress test
+ *
+ * SUBTEST: threads-many-execqueues-%s
+ * Description: Run %arg[1] system allocator threaded test on many exec_queues
+ * Test category: stress test
+ *
+ * SUBTEST: threads-many-large-%s
+ * Description: Run %arg[1] system allocator threaded test many times with large allocations
+ * Test category: stress test
+ *
+ * SUBTEST: threads-many-large-execqueues-%s
+ * Description: Run %arg[1] system allocator threaded test on many exec_queues with large allocations
+ * Test category: stress test
+ *
+ * SUBTEST: threads-shared-vm-many-%s
+ * Description: Run %arg[1] system allocator threaded, shared vm test many times
+ * Test category: stress test
+ *
+ * SUBTEST: threads-shared-vm-many-stride-%s
+ * Description: Run %arg[1] system allocator threaded, shared vm test many times with a stride on each exec
+ * Test category: stress test
+ *
+ * SUBTEST: threads-shared-vm-many-execqueues-%s
+ * Description: Run %arg[1] system allocator threaded, shared vm test on many exec_queues
+ * Test category: stress test
+ *
+ * SUBTEST: threads-shared-vm-many-large-%s
+ * Description: Run %arg[1] system allocator threaded, shared vm test many times with large allocations
+ * Test category: stress test
+ *
+ * SUBTEST: threads-shared-vm-many-large-execqueues-%s
+ * Description: Run %arg[1] system allocator threaded, shared vm test on many exec_queues with large allocations
+ * Test category: stress test
+ *
+ * SUBTEST: process-many-%s
+ * Description: Run %arg[1] system allocator multi-process test many times
+ * Test category: stress test
+ *
+ * SUBTEST: process-many-stride-%s
+ * Description: Run %arg[1] system allocator multi-process test many times with a stride on each exec
+ * Test category: stress test
+ *
+ * SUBTEST: process-many-execqueues-%s
+ * Description: Run %arg[1] system allocator multi-process test on many exec_queues
+ * Test category: stress test
+ *
+ * SUBTEST: process-many-large-%s
+ * Description: Run %arg[1] system allocator multi-process test many times with large allocations
+ * Test category: stress test
+ *
+ * SUBTEST: process-many-large-execqueues-%s
+ * Description: Run %arg[1] system allocator multi-process test on many exec_queues with large allocations
+ * Test category: stress test
+ *
+ * SUBTEST: fault
+ * Description: use a bad system allocator address resulting in a fault
+ * Test category: bad input
+ *
+ * arg[1]:
+ *
+ * @malloc:				malloc single buffer for all execs
+ * @malloc-multi-fault:			malloc single buffer for all execs, issue a command which will trigger multiple faults
+ * @malloc-fork-read:			malloc single buffer for all execs, fork a process to read test output
+ * @malloc-fork-read-after:		malloc single buffer for all execs, fork a process to read test output, check again after fork returns in parent
+ * @malloc-mlock:			malloc and mlock single buffer for all execs
+ * @malloc-race:			malloc single buffer for all execs with race between cpu and gpu access
+ * @malloc-bo-unmap:			malloc single buffer for all execs, bind and unbind a BO to same address before execs
+ * @malloc-busy:			malloc single buffer for all execs, try to unbind while buffer valid
+ * @mmap:				mmap single buffer for all execs
+ * @mmap-remap:				mmap and mremap a buffer for all execs
+ * @mmap-remap-dontunmap:		mmap and mremap a buffer with dontunmap flag for all execs
+ * @mmap-remap-ro:			mmap and mremap a read-only buffer for all execs
+ * @mmap-remap-ro-dontunmap:		mmap and mremap a read-only buffer with dontunmap flag for all execs
+ * @mmap-remap-eocheck:			mmap and mremap a buffer for all execs, check data every other loop iteration
+ * @mmap-remap-dontunmap-eocheck:	mmap and mremap a buffer with dontunmap flag for all execs, check data every other loop iteration
+ * @mmap-remap-ro-eocheck:		mmap and mremap a read-only buffer for all execs, check data every other loop iteration
+ * @mmap-remap-ro-dontunmap-eocheck:	mmap and mremap a read-only buffer with dontunmap flag for all execs, check data every other loop iteration
+ * @mmap-huge:				mmap huge page single buffer for all execs
+ * @mmap-shared:			mmap shared single buffer for all execs
+ * @mmap-shared-remap:			mmap shared and mremap a buffer for all execs
+ * @mmap-shared-remap-dontunmap:	mmap shared and mremap a buffer with dontunmap flag for all execs
+ * @mmap-shared-remap-eocheck:		mmap shared and mremap a buffer for all execs, check data every other loop iteration
+ * @mmap-shared-remap-dontunmap-eocheck:	mmap shared and mremap a buffer with dontunmap flag for all execs, check data every other loop iteration
+ * @mmap-mlock:				mmap and mlock single buffer for all execs
+ * @mmap-file:				mmap single buffer, with file backing, for all execs
+ * @mmap-file-mlock:			mmap and mlock single buffer, with file backing, for all execs
+ * @mmap-race:				mmap single buffer for all execs with race between cpu and gpu access
+ * @free:				malloc and free buffer for each exec
+ * @free-race:				malloc and free buffer for each exec with race between cpu and gpu access
+ * @new:				malloc a new buffer for each exec
+ * @new-race:				malloc a new buffer for each exec with race between cpu and gpu access
+ * @new-bo-map:				malloc a new buffer or map BO for each exec
+ * @new-busy:				malloc a new buffer for each exec, try to unbind while buffers valid
+ * @mmap-free:				mmap and free buffer for each exec
+ * @mmap-free-huge:			mmap huge page and free buffer for each exec
+ * @mmap-free-race:			mmap and free buffer for each exec with race between cpu and gpu access
+ * @mmap-new:				mmap a new buffer for each exec
+ * @mmap-new-huge:			mmap huge page a new buffer for each exec
+ * @mmap-new-race:			mmap a new buffer for each exec with race between cpu and gpu access
+ * @malloc-nomemset:			malloc single buffer for all execs, skip memset of buffers
+ * @malloc-mlock-nomemset:		malloc and mlock single buffer for all execs, skip memset of buffers
+ * @malloc-race-nomemset:		malloc single buffer for all execs with race between cpu and gpu access, skip memset of buffers
+ * @malloc-bo-unmap-nomemset:		malloc single buffer for all execs, bind and unbind a BO to same address before execs, skip memset of buffers
+ * @malloc-busy-nomemset:		malloc single buffer for all execs, try to unbind while buffer valid, skip memset of buffers
+ * @mmap-nomemset:			mmap single buffer for all execs, skip memset of buffers
+ * @mmap-huge-nomemset:			mmap huge page single buffer for all execs, skip memset of buffers
+ * @mmap-shared-nomemset:		mmap shared single buffer for all execs, skip memset of buffers
+ * @mmap-mlock-nomemset:		mmap and mlock single buffer for all execs, skip memset of buffers
+ * @mmap-file-nomemset:			mmap single buffer, with file backing, for all execs, skip memset of buffers
+ * @mmap-file-mlock-nomemset:		mmap and mlock single buffer, with file backing, for all execs, skip memset of buffers
+ * @mmap-race-nomemset:			mmap single buffer for all execs with race between cpu and gpu access, skip memset of buffers
+ * @free-nomemset:			malloc and free buffer for each exec, skip memset of buffers
+ * @free-race-nomemset:			malloc and free buffer for each exec with race between cpu and gpu access, skip memset of buffers
+ * @new-nomemset:			malloc a new buffer for each exec, skip memset of buffers
+ * @new-race-nomemset:			malloc a new buffer for each exec with race between cpu and gpu access, skip memset of buffers
+ * @new-bo-map-nomemset:		malloc a new buffer or map BO for each exec, skip memset of buffers
+ * @new-busy-nomemset:			malloc a new buffer for each exec, try to unbind while buffers valid, skip memset of buffers
+ * @mmap-free-nomemset:			mmap and free buffer for each exec, skip memset of buffers
+ * @mmap-free-huge-nomemset:		mmap huge page and free buffer for each exec, skip memset of buffers
+ * @mmap-free-race-nomemset:		mmap and free buffer for each exec with race between cpu and gpu access, skip memset of buffers
+ * @mmap-new-nomemset:			mmap a new buffer for each exec, skip memset of buffers
+ * @mmap-new-huge-nomemset:		mmap huge page new buffer for each exec, skip memset of buffers
+ * @mmap-new-race-nomemset:		mmap a new buffer for each exec with race between cpu and gpu access, skip memset of buffers
+ *
+ * SUBTEST: threads-shared-vm-shared-alloc-many-stride-malloc
+ * Description: Create multiple threads with a shared VM triggering faults on different hardware engines to same addresses
+ * Test category: stress test
+ *
+ * SUBTEST: threads-shared-vm-shared-alloc-many-stride-malloc-race
+ * Description: Create multiple threads with a shared VM triggering faults on different hardware engines to same addresses, racing between CPU and GPU access
+ * Test category: stress test
+ *
+ * SUBTEST: threads-shared-alloc-many-stride-malloc
+ * Description: Create multiple threads triggering faults on different hardware engines to same addresses
+ * Test category: stress test
+ *
+ * SUBTEST: threads-shared-alloc-many-stride-malloc-sync
+ * Description: Create multiple threads triggering faults on different hardware engines to same addresses, syncing on each exec
+ * Test category: stress test
+ *
+ * SUBTEST: threads-shared-alloc-many-stride-malloc-race
+ * Description: Create multiple threads triggering faults on different hardware engines to same addresses, racing between CPU and GPU access
+ * Test category: stress test
+ */
+
+/*
+ * Per-exec slot inside the test buffer. test_exec() indexes the buffer as an
+ * array of these (data[idx]) and hands the GPU addresses of individual fields.
+ */
+struct test_exec_data {
+	/* Batch buffer filled by write_dword()/__write_dword() and executed */
+	uint32_t batch[32];
+	/* Not referenced in test_exec(); presumably alignment padding - confirm */
+	uint64_t pad;
+	/* User fence signalled by VM bind operations (only slot 0 is used) */
+	uint64_t vm_sync;
+	/* User fence signalled by the exec when no separate fence page exists */
+	uint64_t exec_sync;
+	/* Target of the batch's store; compared against READ_VALUE() */
+	uint32_t data;
+	/* NOTE(review): likely consumed by WRITE_VALUE()/READ_VALUE() - confirm */
+	uint32_t expected_data;
+};
+
+/*
+ * test_exec() - run n_execs batches against system-allocator backed memory
+ * @fd: xe DRM file descriptor
+ * @eci: engine to create the exec queues on
+ * @n_exec_queues: number of exec queues, at most MAX_N_EXEC_QUEUES; execs are
+ *		   distributed round-robin over them
+ * @n_execs: number of batches to submit
+ * @bo_size: buffer size in bytes; 0 derives it from n_execs and stride
+ * @stride: slot stride in units of struct test_exec_data; 0 means slots are
+ *	    packed (idx == i)
+ * @vm: VM to use; 0 creates (and later destroys) a private fault-mode VM and
+ *	binds the system allocator to it
+ * @alloc: caller-provided buffer, or NULL to allocate one here according to
+ *	   flags; a caller-provided malloc buffer is not freed here
+ * @barrier: optional barrier waited on before and after every submission
+ * @flags: bitmask of the MMAP/NEW/... behaviour flags
+ *
+ * Each iteration writes a batch into slot idx of the buffer, executes it,
+ * waits for the user fence and checks that data[idx].data matches
+ * READ_VALUE(). Depending on flags the buffer is then remapped, replaced or
+ * freed before the next iteration.
+ *
+ * Returns early without doing anything for flag/parameter combinations that
+ * are not supported (SHARED_ALLOC still set, MULTI_FAULT without a bo_size,
+ * EVERY_OTHER_CHECK with an odd n_execs).
+ */
+static void
+test_exec(int fd, struct drm_xe_engine_class_instance *eci,
+	  int n_exec_queues, int n_execs, size_t bo_size,
+	  size_t stride, uint32_t vm, void *alloc, pthread_barrier_t *barrier,
+	  unsigned int flags)
+{
+	uint64_t addr;
+	struct drm_xe_sync sync[1] = {
+		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
+		  .timeline_value = USER_FENCE_VALUE },
+	};
+	struct drm_xe_exec exec = {
+		.num_batch_buffer = 1,
+		.num_syncs = 1,
+		.syncs = to_user_pointer(sync),
+	};
+	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
+	struct test_exec_data *data, *next_data = NULL;
+	uint32_t bo_flags;
+	uint32_t bo = 0;
+	/* Only allocated and used when NEW is set without FREE */
+	void **pending_free = NULL;
+	u64 *exec_ufence = NULL;
+	/* prev_idx is only read on odd iterations, after it has been assigned */
+	int i, j, b, file_fd = -1, prev_idx = 0;
+	bool free_vm = false;
+	size_t aligned_size = bo_size ?: xe_get_default_alignment(fd);
+	/* Caller's size before the MULTI_FAULT scaling; spacing of the stores */
+	size_t orig_size = bo_size;
+	struct aligned_alloc_type aligned_alloc_type;
+
+	if (flags & MULTI_FAULT) {
+		if (!bo_size)
+			return;
+
+		bo_size *= N_MULTI_FAULT;
+	}
+
+	/* threads() clears SHARED_ALLOC; other callers cannot provide @alloc */
+	if (flags & SHARED_ALLOC)
+		return;
+
+	/* Checks happen on odd iterations only, so pairs of execs are needed */
+	if (flags & EVERY_OTHER_CHECK && odd(n_execs))
+		return;
+
+	if (flags & EVERY_OTHER_CHECK)
+		igt_assert(flags & MREMAP);
+
+	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
+
+	if (flags & NEW && !(flags & FREE)) {
+		/* One zeroed slot per exec for buffers released at the end */
+		pending_free = calloc(n_execs, sizeof(*pending_free));
+		igt_assert(pending_free);
+	}
+
+	if (!vm) {
+		vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
+				  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
+		free_vm = true;
+	}
+	if (!bo_size) {
+		if (!stride) {
+			bo_size = sizeof(*data) * n_execs;
+			bo_size = xe_bb_size(fd, bo_size);
+		} else {
+			bo_size = stride * n_execs * sizeof(*data);
+			bo_size = xe_bb_size(fd, bo_size);
+		}
+	}
+	if (flags & HUGE_PAGE) {
+		aligned_size = ALIGN(aligned_size, SZ_2M);
+		bo_size = ALIGN(bo_size, SZ_2M);
+	}
+
+	if (alloc) {
+		data = alloc;
+	} else {
+		if (flags & MMAP) {
+			int mmap_flags = MAP_FIXED;
+
+			/*
+			 * Pick an aligned address first, then MAP_FIXED over it.
+			 * NOTE(review): presumably __aligned_partial_free()
+			 * releases the reservation around/at that address -
+			 * confirm against the helper's definition.
+			 */
+			aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
+			data = aligned_alloc_type.ptr;
+			igt_assert(data);
+			__aligned_partial_free(&aligned_alloc_type);
+
+			if (flags & MMAP_SHARED)
+				mmap_flags |= MAP_SHARED;
+			else
+				mmap_flags |= MAP_PRIVATE;
+
+			if (flags & HUGE_PAGE)
+				mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB;
+
+			if (flags & FILE_BACKED) {
+				char name[] = "/tmp/xe_exec_system_allocator_datXXXXXX";
+
+				igt_assert(!(flags & NEW));
+
+				file_fd = mkstemp(name);
+				igt_assert(file_fd != -1);
+				/*
+				 * Drop the name right away so the file does not
+				 * linger in /tmp; the open fd keeps the backing
+				 * store alive until close().
+				 */
+				unlink(name);
+				igt_assert_eq(posix_fallocate(file_fd, 0, bo_size), 0);
+			} else {
+				mmap_flags |= MAP_ANONYMOUS;
+			}
+
+			data = mmap(data, bo_size, PROT_READ |
+				    PROT_WRITE, mmap_flags, file_fd, 0);
+			igt_assert(data != MAP_FAILED);
+		} else {
+			data = aligned_alloc(aligned_size, bo_size);
+			igt_assert(data);
+		}
+		if (!(flags & SKIP_MEMSET))
+			memset(data, 0, bo_size);
+		if (flags & LOCK) {
+			igt_assert(!(flags & NEW));
+			mlock(data, bo_size);
+		}
+	}
+
+	for (i = 0; i < n_exec_queues; i++)
+		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
+
+	/* A privately created VM still needs the system allocator bound */
+	sync[0].addr = to_user_pointer(&data[0].vm_sync);
+	if (free_vm) {
+		bind_system_allocator(sync, 1);
+		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0, FIVE_SEC);
+	}
+	data[0].vm_sync = 0;
+
+	addr = to_user_pointer(data);
+
+	if (flags & BO_UNMAP) {
+		/*
+		 * Bind a BO over the buffer's address range, then replace that
+		 * binding with a CPU address mirror mapping and drop the BO.
+		 */
+		bo_flags = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
+		bo = xe_bo_create(fd, vm, bo_size,
+				  vram_if_possible(fd, eci->gt_id), bo_flags);
+		xe_vm_bind_async(fd, vm, 0, bo, 0, addr, bo_size, 0, 0);
+
+		__xe_vm_bind_assert(fd, vm, 0,
+				    0, 0, addr, bo_size,
+				    DRM_XE_VM_BIND_OP_MAP,
+				    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR, sync,
+				    1, 0, 0);
+		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0,
+			       FIVE_SEC);
+		data[0].vm_sync = 0;
+		gem_close(fd, bo);
+		bo = 0;
+	}
+
+	if (!(flags & RACE)) {
+		/* Dedicated fence page, separate from the test buffer */
+		exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
+				   PROT_WRITE, MAP_SHARED |
+				   MAP_ANONYMOUS, -1, 0);
+		igt_assert(exec_ufence != MAP_FAILED);
+		memset(exec_ufence, 0, SZ_4K);
+	}
+
+	for (i = 0; i < n_execs; i++) {
+		int idx = !stride ? i : i * stride, next_idx = !stride
+			? (i + 1) : (i + 1) * stride;
+		uint64_t batch_offset = (char *)&data[idx].batch - (char *)data;
+		uint64_t batch_addr = addr + batch_offset;
+		uint64_t sdi_offset = (char *)&data[idx].data - (char *)data;
+		uint64_t sdi_addr = addr + sdi_offset;
+		int e = i % n_exec_queues, err;
+		bool fault_inject = (FAULT & flags) && i == n_execs / 2;
+		/*
+		 * NOTE(review): i never exceeds n_execs inside this loop, so
+		 * fault_injected is always false and the -ENOENT path below is
+		 * dead. "i > n_execs / 2" may have been intended - confirm the
+		 * expected post-fault behaviour before changing it.
+		 */
+		bool fault_injected = (FAULT & flags) && i > n_execs;
+
+		if (barrier)
+			pthread_barrier_wait(barrier);
+
+		if (flags & MULTI_FAULT) {
+			/*
+			 * N_MULTI_FAULT stores, orig_size apart; only the last
+			 * one goes through write_dword().
+			 */
+			b = 0;
+			for (j = 0; j < N_MULTI_FAULT - 1; ++j)
+				__write_dword(data[idx].batch,
+					      sdi_addr + j * orig_size,
+					      WRITE_VALUE(&data[idx], idx), &b);
+			write_dword(data[idx].batch, sdi_addr + j * orig_size,
+				    WRITE_VALUE(&data[idx], idx), &b);
+			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
+		} else if (!(flags & EVERY_OTHER_CHECK)) {
+			b = 0;
+			write_dword(data[idx].batch, sdi_addr,
+				    WRITE_VALUE(&data[idx], idx), &b);
+			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
+		} else if (flags & EVERY_OTHER_CHECK && !odd(i)) {
+			/*
+			 * Even iteration: prepare this batch and the next one.
+			 * The next batch targets the address the buffer will
+			 * be mremap()'ed to (next_data) after this exec.
+			 */
+			b = 0;
+			write_dword(data[idx].batch, sdi_addr,
+				    WRITE_VALUE(&data[idx], idx), &b);
+			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
+
+			aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
+			next_data = aligned_alloc_type.ptr;
+			igt_assert(next_data);
+			__aligned_partial_free(&aligned_alloc_type);
+
+			b = 0;
+			write_dword(data[next_idx].batch,
+				    to_user_pointer(next_data) +
+				    (char *)&data[next_idx].data - (char *)data,
+				    WRITE_VALUE(&data[next_idx], next_idx), &b);
+			igt_assert(b <= ARRAY_SIZE(data[next_idx].batch));
+		}
+
+		/* Without a fence page the fence lives in the slot itself */
+		if (!exec_ufence)
+			data[idx].exec_sync = 0;
+
+		sync[0].addr = exec_ufence ? to_user_pointer(exec_ufence) :
+			addr + (char *)&data[idx].exec_sync - (char *)data;
+
+		exec.exec_queue_id = exec_queues[e];
+		if (fault_inject)
+			exec.address = batch_addr * 2;
+		else
+			exec.address = batch_addr;
+
+		if (fault_injected) {
+			err = __xe_exec(fd, &exec);
+			igt_assert(err == -ENOENT);
+		} else {
+			xe_exec(fd, &exec);
+		}
+
+		if (barrier)
+			pthread_barrier_wait(barrier);
+
+		if (fault_inject || fault_injected) {
+			int64_t timeout = QUARTER_SEC;
+
+			/* The fence is expected not to signal after a fault */
+			err = __xe_wait_ufence(fd, exec_ufence ? exec_ufence :
+					       &data[idx].exec_sync,
+					       USER_FENCE_VALUE,
+					       exec_queues[e], &timeout);
+			igt_assert(err == -ETIME || err == -EIO);
+		} else {
+			xe_wait_ufence(fd, exec_ufence ? exec_ufence :
+				       &data[idx].exec_sync, USER_FENCE_VALUE,
+				       exec_queues[e], FIVE_SEC);
+			if (flags & LOCK && !i)
+				munlock(data, bo_size);
+
+			if (flags & MREMAP) {
+				void *old = data;
+				int remap_flags = MREMAP_MAYMOVE | MREMAP_FIXED;
+
+				if (flags & DONTUNMAP)
+					remap_flags |= MREMAP_DONTUNMAP;
+
+				if (flags & READ_ONLY_REMAP)
+					igt_assert(!mprotect(old, bo_size,
+							     PROT_READ));
+
+				/* Reuse the address picked earlier, if any */
+				if (!next_data) {
+					aligned_alloc_type = __aligned_alloc(aligned_size,
+								    bo_size);
+					data = aligned_alloc_type.ptr;
+					__aligned_partial_free(&aligned_alloc_type);
+				} else {
+					data = next_data;
+				}
+				next_data = NULL;
+				igt_assert(data);
+
+				data = mremap(old, bo_size, bo_size,
+					      remap_flags, data);
+				igt_assert(data != MAP_FAILED);
+
+				if (flags & READ_ONLY_REMAP)
+					igt_assert(!mprotect(data, bo_size,
+							     PROT_READ |
+							     PROT_WRITE));
+
+				addr = to_user_pointer(data);
+				/* MREMAP_DONTUNMAP leaves the old range mapped */
+				if (flags & DONTUNMAP)
+					munmap(old, bo_size);
+			}
+
+			if (!(flags & EVERY_OTHER_CHECK) || odd(i)) {
+				if (flags & FORK_READ) {
+					igt_fork(child, 1)
+						igt_assert_eq(data[idx].data,
+							      READ_VALUE(&data[idx]));
+					if (!(flags & FORK_READ_AFTER))
+						igt_assert_eq(data[idx].data,
+							      READ_VALUE(&data[idx]));
+					igt_waitchildren();
+					if (flags & FORK_READ_AFTER)
+						igt_assert_eq(data[idx].data,
+							      READ_VALUE(&data[idx]));
+				} else {
+					igt_assert_eq(data[idx].data,
+						      READ_VALUE(&data[idx]));
+
+					if (flags & MULTI_FAULT) {
+						/* Check the other stores too */
+						for (j = 1; j < N_MULTI_FAULT; ++j) {
+							struct test_exec_data *__data =
+								((void *)data) + j * orig_size;
+
+							igt_assert_eq(__data[idx].data,
+								      READ_VALUE(&data[idx]));
+						}
+					}
+				}
+				/* Also verify the unchecked even iteration */
+				if (flags & EVERY_OTHER_CHECK)
+					igt_assert_eq(data[prev_idx].data,
+						      READ_VALUE(&data[prev_idx]));
+			}
+		}
+
+		if (exec_ufence)
+			exec_ufence[0] = 0;
+
+		if (bo) {
+			/* Replace the BO binding with a mirror mapping again */
+			__xe_vm_bind_assert(fd, vm, 0,
+					    0, 0, addr, bo_size,
+					    DRM_XE_VM_BIND_OP_MAP,
+					    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,
+					    NULL, 0, 0, 0);
+			munmap(data, bo_size);
+			gem_close(fd, bo);
+		}
+
+		if (flags & NEW) {
+			if (flags & MMAP) {
+				if (flags & FREE)
+					munmap(data, bo_size);
+				else
+					pending_free[i] = data;
+				data = mmap(NULL, bo_size, PROT_READ |
+					    PROT_WRITE, MAP_SHARED |
+					    MAP_ANONYMOUS, -1, 0);
+				igt_assert(data != MAP_FAILED);
+			} else if (flags & BO_MAP && odd(i)) {
+				/* A BO mapping was already munmap()'ed above */
+				if (!bo) {
+					if (flags & FREE)
+						free(data);
+					else
+						pending_free[i] = data;
+				}
+
+				aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
+				data = aligned_alloc_type.ptr;
+				igt_assert(data);
+				__aligned_partial_free(&aligned_alloc_type);
+
+				bo_flags = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
+				bo = xe_bo_create(fd, vm, bo_size,
+						  vram_if_possible(fd, eci->gt_id),
+						  bo_flags);
+				data = xe_bo_map_fixed(fd, bo, bo_size,
+						       to_user_pointer(data));
+
+				xe_vm_bind_async(fd, vm, 0, bo, 0,
+						 to_user_pointer(data),
+						 bo_size, 0, 0);
+			} else {
+				if (!bo) {
+					if (flags & FREE)
+						free(data);
+					else
+						pending_free[i] = data;
+				}
+				bo = 0;
+				data = aligned_alloc(aligned_size, bo_size);
+				igt_assert(data);
+			}
+			addr = to_user_pointer(data);
+			if (!(flags & SKIP_MEMSET))
+				memset(data, 0, bo_size);
+		}
+
+		prev_idx = idx;
+	}
+
+	if (bo) {
+		__xe_vm_bind_assert(fd, vm, 0,
+				    0, 0, addr, bo_size,
+				    DRM_XE_VM_BIND_OP_MAP,
+				    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,
+				    NULL, 0, 0, 0);
+		munmap(data, bo_size);
+		/* Nothing left for the final cleanup to release */
+		data = NULL;
+		gem_close(fd, bo);
+	}
+
+	if (flags & BUSY)
+		igt_assert_eq(unbind_system_allocator(), -EBUSY);
+
+	for (i = 0; i < n_exec_queues; i++)
+		xe_exec_queue_destroy(fd, exec_queues[i]);
+
+	if (exec_ufence)
+		munmap(exec_ufence, SZ_4K);
+
+	if (flags & LOCK)
+		munlock(data, bo_size);
+
+	if (file_fd != -1)
+		close(file_fd);
+
+	if (flags & NEW && !(flags & FREE)) {
+		/* Release every buffer that was parked during the loop */
+		for (i = 0; i < n_execs; i++) {
+			if (!pending_free[i])
+				continue;
+
+			if (flags & MMAP)
+				munmap(pending_free[i], bo_size);
+			else
+				free(pending_free[i]);
+		}
+		free(pending_free);
+	}
+	if (data) {
+		if (flags & MMAP)
+			munmap(data, bo_size);
+		else if (!alloc)
+			free(data);
+	}
+	if (free_vm)
+		xe_vm_destroy(fd, vm);
+}
+
+/* Per-thread arguments handed from threads() to the thread() entry point */
+struct thread_data {
+	/* Handle filled in by pthread_create() and joined by threads() */
+	pthread_t thread;
+	/* Start gate: mutex/cond protecting *go, shared by all threads */
+	pthread_mutex_t *mutex;
+	pthread_cond_t *cond;
+	/* Forwarded to test_exec(); NULL unless SYNC_EXEC is set */
+	pthread_barrier_t *barrier;
+	/* The remaining fields up to alloc are forwarded to test_exec() as-is */
+	int fd;
+	struct drm_xe_engine_class_instance *eci;
+	int n_exec_queues;
+	int n_execs;
+	size_t bo_size;
+	size_t stride;
+	uint32_t vm;
+	unsigned int flags;
+	/* This thread's starting slot in the shared allocation, or NULL */
+	void *alloc;
+	/* Points at the flag threads() sets to release all threads at once */
+	bool *go;
+};
+
+/*
+ * pthread entry point: block until the shared go flag is raised, then run
+ * test_exec() with the parameters stored in the struct thread_data passed
+ * as @data. Always returns NULL.
+ */
+static void *thread(void *data)
+{
+	struct thread_data *td = data;
+
+	/* Park on the condition variable until threads() releases everyone */
+	pthread_mutex_lock(td->mutex);
+	for (;;) {
+		if (*td->go)
+			break;
+		pthread_cond_wait(td->cond, td->mutex);
+	}
+	pthread_mutex_unlock(td->mutex);
+
+	test_exec(td->fd, td->eci, td->n_exec_queues, td->n_execs,
+		  td->bo_size, td->stride, td->vm, td->alloc, td->barrier,
+		  td->flags);
+
+	return NULL;
+}
+
+/*
+ * threads() - run test_exec() concurrently, one thread per engine
+ * @fd: xe DRM file descriptor
+ * @n_exec_queues: exec queues per thread, forwarded to test_exec()
+ * @n_execs: execs per thread, forwarded to test_exec()
+ * @bo_size: buffer size, forwarded to test_exec()
+ * @stride: slot stride, forwarded to test_exec(); must be non-zero when
+ *	    SHARED_ALLOC is set
+ * @flags: behaviour flags; FILE_BACKED and FORK_READ are not supported here
+ *	   and make the function return immediately
+ * @shared_vm: create a single fault-mode VM used by all threads instead of
+ *	       letting each test_exec() call create its own
+ *
+ * All threads are created first and then released together through a
+ * mutex/condition-variable gate. With SHARED_ALLOC a single buffer is
+ * allocated here and every thread starts at its own slot inside it.
+ */
+static void
+threads(int fd, int n_exec_queues, int n_execs, size_t bo_size,
+	size_t stride, unsigned int flags, bool shared_vm)
+{
+	struct drm_xe_engine_class_instance *hwe;
+	struct thread_data *threads_data;
+	int n_engines = 0, i = 0;
+	pthread_mutex_t mutex;
+	pthread_cond_t cond;
+	pthread_barrier_t barrier;
+	uint32_t vm = 0;
+	bool go = false;
+	void *alloc = NULL;
+
+	if ((FILE_BACKED | FORK_READ) & flags)
+		return;
+
+	xe_for_each_engine(fd, hwe)
+		++n_engines;
+
+	if (shared_vm) {
+		vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
+				  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
+		bind_system_allocator(NULL, 0);
+	}
+
+	if (flags & SHARED_ALLOC) {
+		uint64_t alloc_size;
+
+		igt_assert(stride);
+
+		alloc_size = sizeof(struct test_exec_data) * stride *
+			n_execs * n_engines;
+		alloc_size = xe_bb_size(fd, alloc_size);
+		alloc = aligned_alloc(SZ_2M, alloc_size);
+		igt_assert(alloc);
+
+		memset(alloc, 0, alloc_size);
+		/* test_exec() bails out if it still sees SHARED_ALLOC */
+		flags &= ~SHARED_ALLOC;
+	}
+
+	threads_data = calloc(n_engines, sizeof(*threads_data));
+	igt_assert(threads_data);
+
+	pthread_mutex_init(&mutex, 0);
+	pthread_cond_init(&cond, 0);
+	pthread_barrier_init(&barrier, 0, n_engines);
+
+	xe_for_each_engine(fd, hwe) {
+		threads_data[i].mutex = &mutex;
+		threads_data[i].cond = &cond;
+		threads_data[i].barrier = (flags & SYNC_EXEC) ? &barrier : NULL;
+		threads_data[i].fd = fd;
+		threads_data[i].eci = hwe;
+		threads_data[i].n_exec_queues = n_exec_queues;
+		threads_data[i].n_execs = n_execs;
+		threads_data[i].bo_size = bo_size;
+		threads_data[i].stride = stride;
+		threads_data[i].vm = vm;
+		threads_data[i].flags = flags;
+		/* Thread i starts one slot further into the shared buffer */
+		threads_data[i].alloc = alloc ? alloc + i *
+			sizeof(struct test_exec_data) : NULL;
+		threads_data[i].go = &go;
+		pthread_create(&threads_data[i].thread, 0, thread,
+			       &threads_data[i]);
+		++i;
+	}
+
+	/* Release all threads at once */
+	pthread_mutex_lock(&mutex);
+	go = true;
+	pthread_cond_broadcast(&cond);
+	pthread_mutex_unlock(&mutex);
+
+	for (i = 0; i < n_engines; ++i)
+		pthread_join(threads_data[i].thread, NULL);
+
+	if (shared_vm) {
+		int ret;
+
+		if (flags & MMAP) {
+			int tries = 300;
+
+			/*
+			 * Retry while the unbind reports -EBUSY, pausing 10ms
+			 * between attempts. sleep(.01) would truncate to
+			 * sleep(0) and spin without any delay.
+			 */
+			while (tries && (ret = unbind_system_allocator()) == -EBUSY) {
+				usleep(10000);
+				--tries;
+			}
+			igt_assert_eq(ret, 0);
+		}
+		xe_vm_destroy(fd, vm);
+	}
+
+	/*
+	 * All threads have been joined, so nothing references the shared
+	 * buffer any more; free it whether or not the VM was shared.
+	 */
+	free(alloc);
+
+	pthread_barrier_destroy(&barrier);
+	pthread_cond_destroy(&cond);
+	pthread_mutex_destroy(&mutex);
+	free(threads_data);
+}
+
+/*
+ * process() - body of one forked child in the multi-process tests
+ * @hwe: engine this child runs on
+ * @n_exec_queues: forwarded to test_exec()
+ * @n_execs: forwarded to test_exec()
+ * @bo_size: forwarded to test_exec()
+ * @stride: forwarded to test_exec()
+ * @flags: forwarded to test_exec()
+ *
+ * Maps the shared sync file, blocks in wait_pdata() until the parent signals,
+ * then opens its own xe device and runs test_exec() with a private VM and
+ * buffer.
+ */
+static void process(struct drm_xe_engine_class_instance *hwe, int n_exec_queues,
+		    int n_execs, size_t bo_size, size_t stride,
+		    unsigned int flags)
+{
+	struct process_data *pdata;
+	int map_fd;
+	int fd;
+
+	/* No O_CREAT here, so open() takes no mode argument */
+	map_fd = open(sync_file, O_RDWR);
+	igt_assert(map_fd >= 0);
+	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
+		     PROT_WRITE, MAP_SHARED, map_fd, 0);
+	igt_assert(pdata != MAP_FAILED);
+	wait_pdata(pdata);
+
+	fd = drm_open_driver(DRIVER_XE);
+	test_exec(fd, hwe, n_exec_queues, n_execs,
+		  bo_size, stride, 0, NULL, NULL, flags);
+	drm_close_driver(fd);
+
+	close(map_fd);
+	munmap(pdata, sizeof(*pdata));
+}
+
+/*
+ * processes() - run test_exec() in one forked child per engine
+ * @fd: xe DRM file descriptor, used here only to enumerate engines
+ * @n_exec_queues: forwarded to each child
+ * @n_execs: forwarded to each child
+ * @bo_size: forwarded to each child
+ * @stride: forwarded to each child
+ * @flags: forwarded to each child; FORK_READ is not supported here and makes
+ *	   the function return immediately
+ *
+ * A struct process_data in the file-backed shared mapping of sync_file is
+ * initialised, the children are forked, and signal_pdata() then releases
+ * them together before the parent waits for all of them.
+ */
+static void
+processes(int fd, int n_exec_queues, int n_execs, size_t bo_size,
+	  size_t stride, unsigned int flags)
+{
+	struct drm_xe_engine_class_instance *hwe;
+	struct process_data *pdata;
+	int map_fd;
+
+	if (flags & FORK_READ)
+		return;
+
+	/* Octal 0666: the hex 0x666 used before is not rw-rw-rw- */
+	map_fd = open(sync_file, O_RDWR | O_CREAT, 0666);
+	igt_assert(map_fd >= 0);
+	igt_assert_eq(posix_fallocate(map_fd, 0, sizeof(*pdata)), 0);
+	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
+		     PROT_WRITE, MAP_SHARED, map_fd, 0);
+	igt_assert(pdata != MAP_FAILED);
+
+	init_pdata(pdata, 0);
+
+	xe_for_each_engine(fd, hwe) {
+		igt_fork(child, 1)
+			process(hwe, n_exec_queues, n_execs, bo_size,
+				stride, flags);
+	}
+
+	/* Children block in wait_pdata() until this point */
+	signal_pdata(pdata);
+	igt_waitchildren();
+
+	close(map_fd);
+	munmap(pdata, sizeof(*pdata));
+}
+
+/* One row of a subtest table: a name suffix and the flags that go with it */
+struct section {
+	/* Substituted into the subtest name format; NULL terminates a table */
+	const char *name;
+	/* Flag bitmask handed to the test function for this section */
+	unsigned int flags;
+};
+
+igt_main
+{
+	struct drm_xe_engine_class_instance *hwe;
+	const struct section sections[] = {
+		{ "malloc", 0 },
+		{ "malloc-multi-fault", MULTI_FAULT },
+		{ "malloc-fork-read", FORK_READ },
+		{ "malloc-fork-read-after", FORK_READ | FORK_READ_AFTER },
+		{ "malloc-mlock", LOCK },
+		{ "malloc-race", RACE },
+		{ "malloc-busy", BUSY },
+		{ "malloc-bo-unmap", BO_UNMAP },
+		{ "mmap", MMAP },
+		{ "mmap-remap", MMAP | MREMAP },
+		{ "mmap-remap-dontunmap", MMAP | MREMAP | DONTUNMAP },
+		{ "mmap-remap-ro", MMAP | MREMAP | READ_ONLY_REMAP },
+		{ "mmap-remap-ro-dontunmap", MMAP | MREMAP | DONTUNMAP |
+			READ_ONLY_REMAP },
+		{ "mmap-remap-eocheck", MMAP | MREMAP | EVERY_OTHER_CHECK },
+		{ "mmap-remap-dontunmap-eocheck", MMAP | MREMAP | DONTUNMAP |
+			EVERY_OTHER_CHECK },
+		{ "mmap-remap-ro-eocheck", MMAP | MREMAP | READ_ONLY_REMAP |
+			EVERY_OTHER_CHECK },
+		{ "mmap-remap-ro-dontunmap-eocheck", MMAP | MREMAP | DONTUNMAP |
+			READ_ONLY_REMAP | EVERY_OTHER_CHECK },
+		{ "mmap-huge", MMAP | HUGE_PAGE },
+		{ "mmap-shared", MMAP | LOCK | MMAP_SHARED },
+		{ "mmap-shared-remap", MMAP | LOCK | MMAP_SHARED | MREMAP },
+		{ "mmap-shared-remap-dontunmap", MMAP | LOCK | MMAP_SHARED |
+			MREMAP | DONTUNMAP },
+		{ "mmap-shared-remap-eocheck", MMAP | LOCK | MMAP_SHARED |
+			MREMAP | EVERY_OTHER_CHECK },
+		{ "mmap-shared-remap-dontunmap-eocheck", MMAP | LOCK |
+			MMAP_SHARED | MREMAP | DONTUNMAP | EVERY_OTHER_CHECK },
+		{ "mmap-mlock", MMAP | LOCK },
+		{ "mmap-file", MMAP | FILE_BACKED },
+		{ "mmap-file-mlock", MMAP | LOCK | FILE_BACKED },
+		{ "mmap-race", MMAP | RACE },
+		{ "free", NEW | FREE },
+		{ "free-race", NEW | FREE | RACE },
+		{ "new", NEW },
+		{ "new-race", NEW | RACE },
+		{ "new-bo-map", NEW | BO_MAP },
+		{ "new-busy", NEW | BUSY },
+		{ "mmap-free", MMAP | NEW | FREE },
+		{ "mmap-free-huge", MMAP | NEW | FREE | HUGE_PAGE },
+		{ "mmap-free-race", MMAP | NEW | FREE | RACE },
+		{ "mmap-new", MMAP | NEW },
+		{ "mmap-new-huge", MMAP | NEW | HUGE_PAGE },
+		{ "mmap-new-race", MMAP | NEW | RACE },
+		{ "malloc-nomemset", SKIP_MEMSET },
+		{ "malloc-mlock-nomemset", SKIP_MEMSET | LOCK },
+		{ "malloc-race-nomemset", SKIP_MEMSET | RACE },
+		{ "malloc-busy-nomemset", SKIP_MEMSET | BUSY },
+		{ "malloc-bo-unmap-nomemset", SKIP_MEMSET | BO_UNMAP },
+		{ "mmap-nomemset", SKIP_MEMSET | MMAP },
+		{ "mmap-huge-nomemset", SKIP_MEMSET | MMAP | HUGE_PAGE },
+		{ "mmap-shared-nomemset", SKIP_MEMSET | MMAP | MMAP_SHARED },
+		{ "mmap-mlock-nomemset", SKIP_MEMSET | MMAP | LOCK },
+		{ "mmap-file-nomemset", SKIP_MEMSET | MMAP | FILE_BACKED },
+		{ "mmap-file-mlock-nomemset", SKIP_MEMSET | MMAP | LOCK | FILE_BACKED },
+		{ "mmap-race-nomemset", SKIP_MEMSET | MMAP | RACE },
+		{ "free-nomemset", SKIP_MEMSET | NEW | FREE },
+		{ "free-race-nomemset", SKIP_MEMSET | NEW | FREE | RACE },
+		{ "new-nomemset", SKIP_MEMSET | NEW },
+		{ "new-race-nomemset", SKIP_MEMSET | NEW | RACE },
+		{ "new-bo-map-nomemset", SKIP_MEMSET | NEW | BO_MAP },
+		{ "new-busy-nomemset", SKIP_MEMSET | NEW | BUSY },
+		{ "mmap-free-nomemset", SKIP_MEMSET | MMAP | NEW | FREE },
+		{ "mmap-free-huge-nomemset", SKIP_MEMSET | MMAP | NEW | FREE | HUGE_PAGE },
+		{ "mmap-free-race-nomemset", SKIP_MEMSET | MMAP | NEW | FREE | RACE },
+		{ "mmap-new-nomemset", SKIP_MEMSET | MMAP | NEW },
+		{ "mmap-new-huge-nomemset", SKIP_MEMSET | MMAP | NEW | HUGE_PAGE },
+		{ "mmap-new-race-nomemset", SKIP_MEMSET | MMAP | NEW | RACE },
+		{ NULL },
+	};
+	const struct section psections[] = {
+		{ "munmap-cpu-fault", CPU_FAULT },
+		{ "munmap-no-cpu-fault", 0 },
+		{ "remap-cpu-fault", CPU_FAULT | REMAP },
+		{ "remap-no-cpu-fault", REMAP },
+		{ "middle-munmap-cpu-fault", MIDDLE | CPU_FAULT },
+		{ "middle-munmap-no-cpu-fault", MIDDLE },
+		{ "middle-remap-cpu-fault", MIDDLE | CPU_FAULT | REMAP },
+		{ "middle-remap-no-cpu-fault", MIDDLE | REMAP },
+		{ NULL },
+	};
+	const struct section esections[] = {
+		{ "malloc", 0 },
+		{ "malloc-mix-bo", MIX_BO_ALLOC },
+		{ NULL },
+	};
+	int fd;
+
+	igt_fixture {
+		struct xe_device *xe;
+
+		fd = drm_open_driver(DRIVER_XE);
+		igt_require(!xe_supports_faults(fd));
+
+		xe = xe_device_get(fd);
+		va_bits = xe->va_bits;
+		open_sync_file();
+	}
+
+	for (const struct section *s = sections; s->name; s++) {
+		igt_subtest_f("once-%s", s->name)
+			xe_for_each_engine(fd, hwe)
+				test_exec(fd, hwe, 1, 1, 0, 0, 0, NULL,
+					  NULL, s->flags);
+
+		igt_subtest_f("once-large-%s", s->name)
+			xe_for_each_engine(fd, hwe)
+				test_exec(fd, hwe, 1, 1, SZ_2M, 0, 0, NULL,
+					  NULL, s->flags);
+
+		igt_subtest_f("twice-%s", s->name)
+			xe_for_each_engine(fd, hwe)
+				test_exec(fd, hwe, 1, 2, 0, 0, 0, NULL,
+					  NULL, s->flags);
+
+		igt_subtest_f("twice-large-%s", s->name)
+			xe_for_each_engine(fd, hwe)
+				test_exec(fd, hwe, 1, 2, SZ_2M, 0, 0, NULL,
+					  NULL, s->flags);
+
+		igt_subtest_f("many-%s", s->name)
+			xe_for_each_engine(fd, hwe)
+				test_exec(fd, hwe, 1, 128, 0, 0, 0, NULL,
+					  NULL, s->flags);
+
+		igt_subtest_f("many-stride-%s", s->name)
+			xe_for_each_engine(fd, hwe)
+				test_exec(fd, hwe, 1, 128, 0, 256, 0, NULL,
+					  NULL, s->flags);
+
+		igt_subtest_f("many-execqueues-%s", s->name)
+			xe_for_each_engine(fd, hwe)
+				test_exec(fd, hwe, 16, 128, 0, 0, 0, NULL,
+					  NULL, s->flags);
+
+		igt_subtest_f("many-large-%s", s->name)
+			xe_for_each_engine(fd, hwe)
+				test_exec(fd, hwe, 1, 128, SZ_2M, 0, 0, NULL,
+					  NULL, s->flags);
+
+		igt_subtest_f("many-large-execqueues-%s", s->name)
+			xe_for_each_engine(fd, hwe)
+				test_exec(fd, hwe, 16, 128, SZ_2M, 0, 0, NULL,
+					  NULL, s->flags);
+
+		igt_subtest_f("threads-many-%s", s->name)
+			threads(fd, 1, 128, 0, 0, s->flags, false);
+
+		igt_subtest_f("threads-many-stride-%s", s->name)
+			threads(fd, 1, 128, 0, 256, s->flags, false);
+
+		igt_subtest_f("threads-many-execqueues-%s", s->name)
+			threads(fd, 16, 128, 0, 0, s->flags, false);
+
+		igt_subtest_f("threads-many-large-%s", s->name)
+			threads(fd, 1, 128, SZ_2M, 0, s->flags, false);
+
+		igt_subtest_f("threads-many-large-execqueues-%s", s->name)
+			threads(fd, 16, 128, SZ_2M, 0, s->flags, false);
+
+		igt_subtest_f("threads-shared-vm-many-%s", s->name)
+			threads(fd, 1, 128, 0, 0, s->flags, true);
+
+		igt_subtest_f("threads-shared-vm-many-stride-%s", s->name)
+			threads(fd, 1, 128, 0, 256, s->flags, true);
+
+		igt_subtest_f("threads-shared-vm-many-execqueues-%s", s->name)
+			threads(fd, 16, 128, 0, 0, s->flags, true);
+
+		igt_subtest_f("threads-shared-vm-many-large-%s", s->name)
+			threads(fd, 1, 128, SZ_2M, 0, s->flags, true);
+
+		igt_subtest_f("threads-shared-vm-many-large-execqueues-%s", s->name)
+			threads(fd, 16, 128, SZ_2M, 0, s->flags, true);
+
+		igt_subtest_f("process-many-%s", s->name)
+			processes(fd, 1, 128, 0, 0, s->flags);
+
+		igt_subtest_f("process-many-stride-%s", s->name)
+			processes(fd, 1, 128, 0, 256, s->flags);
+
+		igt_subtest_f("process-many-execqueues-%s", s->name)
+			processes(fd, 16, 128, 0, 0, s->flags);
+
+		igt_subtest_f("process-many-large-%s", s->name)
+			processes(fd, 1, 128, SZ_2M, 0, s->flags);
+
+		igt_subtest_f("process-many-large-execqueues-%s", s->name)
+			processes(fd, 16, 128, SZ_2M, 0, s->flags);
+	}
+
+	igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc")
+		threads(fd, 1, 128, 0, 256, SHARED_ALLOC, true);
+
+	igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc-race")
+		threads(fd, 1, 128, 0, 256, RACE | SHARED_ALLOC, true);
+
+	igt_subtest("threads-shared-alloc-many-stride-malloc")
+		threads(fd, 1, 128, 0, 256, SHARED_ALLOC, false);
+
+	igt_subtest("threads-shared-alloc-many-stride-malloc-sync")
+		threads(fd, 1, 128, 0, 256, SHARED_ALLOC | SYNC_EXEC, false);
+
+	igt_subtest("threads-shared-alloc-many-stride-malloc-race")
+		threads(fd, 1, 128, 0, 256, RACE | SHARED_ALLOC, false);
+
+	igt_subtest_f("fault")
+		xe_for_each_engine(fd, hwe)
+			test_exec(fd, hwe, 4, 1, SZ_2M, 0, 0, NULL, NULL,
+				  FAULT);
+
+	for (const struct section *s = psections; s->name; s++) {
+		igt_subtest_f("partial-%s", s->name)
+			xe_for_each_engine(fd, hwe)
+				partial(fd, hwe, s->flags);
+	}
+
+	igt_subtest_f("unaligned-alloc")
+		xe_for_each_engine(fd, hwe) {
+			many_allocs(fd, hwe, (SZ_1M + SZ_512K) * 8,
+				    SZ_1M + SZ_512K, SZ_4K, NULL, 0);
+			break;
+		}
+
+	igt_subtest_f("fault-benchmark")
+		xe_for_each_engine(fd, hwe)
+			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
+				    BENCHMARK);
+
+	igt_subtest_f("fault-threads-benchmark")
+		xe_for_each_engine(fd, hwe)
+			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
+				    BENCHMARK | CPU_FAULT_THREADS);
+
+	igt_subtest_f("fault-threads-same-page-benchmark")
+		xe_for_each_engine(fd, hwe)
+			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
+				    BENCHMARK | CPU_FAULT_THREADS |
+				    CPU_FAULT_SAME_PAGE);
+
+	igt_subtest_f("fault-process-benchmark")
+		xe_for_each_engine(fd, hwe)
+			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
+				    BENCHMARK | CPU_FAULT_PROCESS);
+
+	igt_subtest_f("fault-process-same-page-benchmark")
+		xe_for_each_engine(fd, hwe)
+			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
+				    BENCHMARK | CPU_FAULT_PROCESS |
+				    CPU_FAULT_SAME_PAGE);
+
+	for (const struct section *s = esections; s->name; s++) {
+		igt_subtest_f("evict-%s", s->name)
+			xe_for_each_engine(fd, hwe) {
+				many_allocs(fd, hwe,
+					    xe_visible_vram_size(fd, hwe->gt_id),
+					    SZ_8M, SZ_1M, NULL, s->flags);
+				break;
+			}
+	}
+
+	for (const struct section *s = esections; s->name; s++) {
+		igt_subtest_f("processes-evict-%s", s->name)
+			processes_evict(fd, SZ_8M, SZ_1M, s->flags);
+	}
+
+	igt_fixture {
+		xe_device_put(fd);
+		drm_close_driver(fd);
+		close_sync_file();
+	}
+}
diff --git a/tests/meson.build b/tests/meson.build
index 6328792e3a..20ddddb89f 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -295,6 +295,7 @@ intel_xe_progs = [
 	'xe_exec_reset',
 	'xe_exec_sip',
 	'xe_exec_store',
+	'xe_exec_system_allocator',
 	'xe_exec_threads',
 	'xe_exercise_blt',
 	'xe_fault_injection',
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* RE: [PATCH v3 1/2] uapi/xe: Sync latest uAPI KMD headers
  2025-04-24 20:44 ` [PATCH v3 1/2] uapi/xe: Sync latest uAPI KMD headers Matthew Brost
@ 2025-04-24 22:32   ` Cavitt, Jonathan
  2025-04-25  6:54   ` Francois Dugast
  1 sibling, 0 replies; 8+ messages in thread
From: Cavitt, Jonathan @ 2025-04-24 22:32 UTC (permalink / raw)
  To: Brost, Matthew, igt-dev@lists.freedesktop.org
  Cc: Dugast, Francois, Cavitt, Jonathan

-----Original Message-----
> From: igt-dev <igt-dev-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
> Sent: Thursday, April 24, 2025 1:44 PM
> To: igt-dev@lists.freedesktop.org
> Cc: Dugast, Francois <francois.dugast@intel.com>
> Subject: [PATCH v3 1/2] uapi/xe: Sync latest uAPI KMD headers
> 
> Pull in latest uAPI KMD headers to enable testing of new features.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

LGTM, though I am a bit surprised that cpu_addr_mirror_offset is
a signed integer and not a uint.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
-Jonathan Cavitt

> ---
>  include/drm-uapi/xe_drm.h | 49 ++++++++++++++++++++++++++++++++++++---
>  1 file changed, 46 insertions(+), 3 deletions(-)
> 
> diff --git a/include/drm-uapi/xe_drm.h b/include/drm-uapi/xe_drm.h
> index 154f947ef0..c90fab1b00 100644
> --- a/include/drm-uapi/xe_drm.h
> +++ b/include/drm-uapi/xe_drm.h
> @@ -393,6 +393,10 @@ struct drm_xe_query_mem_regions {
>   *
>   *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM - Flag is set if the device
>   *      has usable VRAM
> + *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY - Flag is set if the device
> + *      has low latency hint support
> + *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR - Flag is set if the
> + *      device has CPU address mirroring support
>   *  - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment
>   *    required by this device, typically SZ_4K or SZ_64K
>   *  - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address
> @@ -409,6 +413,8 @@ struct drm_xe_query_config {
>  #define DRM_XE_QUERY_CONFIG_REV_AND_DEVICE_ID	0
>  #define DRM_XE_QUERY_CONFIG_FLAGS			1
>  	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM	(1 << 0)
> +	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY	(1 << 1)
> +	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR	(1 << 2)
>  #define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT		2
>  #define DRM_XE_QUERY_CONFIG_VA_BITS			3
>  #define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY	4
> @@ -911,7 +917,11 @@ struct drm_xe_gem_mmap_offset {
>   * struct drm_xe_vm_create - Input of &DRM_IOCTL_XE_VM_CREATE
>   *
>   * The @flags can be:
> - *  - %DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE
> + *  - %DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE - Map the whole virtual address
> + *    space of the VM to a scratch page. A vm_bind would overwrite the scratch
> + *    page mapping. This flag is mutually exclusive with the
> + *    %DRM_XE_VM_CREATE_FLAG_FAULT_MODE flag, with the exception of xe2 and
> + *    xe3 platforms.
>   *  - %DRM_XE_VM_CREATE_FLAG_LR_MODE - An LR, or Long Running VM accepts
>   *    exec submissions to its exec_queues that don't have an upper time
>   *    limit on the job execution time. But exec submissions to these
> @@ -987,6 +997,12 @@ struct drm_xe_vm_destroy {
>   *  - %DRM_XE_VM_BIND_FLAG_CHECK_PXP - If the object is encrypted via PXP,
>   *    reject the binding if the encryption key is no longer valid. This
>   *    flag has no effect on BOs that are not marked as using PXP.
> + *  - %DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR - When the CPU address mirror flag is
> + *    set, no mappings are created; rather, the range is reserved for CPU address
> + *    mirroring, which will be populated on GPU page faults or prefetches. Only
> + *    valid on VMs with DRM_XE_VM_CREATE_FLAG_FAULT_MODE set. The CPU address
> + *    mirror flag is only valid for DRM_XE_VM_BIND_OP_MAP operations; the BO
> + *    handle and the BO offset MBZ (must be zero).
>   */
>  struct drm_xe_vm_bind_op {
>  	/** @extensions: Pointer to the first extension struct, if any */
> @@ -1039,7 +1055,9 @@ struct drm_xe_vm_bind_op {
>  	 * on the @pat_index. For such mappings there is no actual memory being
>  	 * mapped (the address in the PTE is invalid), so the various PAT memory
>  	 * attributes likely do not apply.  Simply leaving as zero is one
> -	 * option (still a valid pat_index).
> +	 * option (still a valid pat_index). Same applies to
> +	 * DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR bindings as for such mapping
> +	 * there is no actual memory being mapped.
>  	 */
>  	__u16 pat_index;
>  
> @@ -1055,6 +1073,14 @@ struct drm_xe_vm_bind_op {
>  
>  		/** @userptr: user pointer to bind on */
>  		__u64 userptr;
> +
> +		/**
> +		 * @cpu_addr_mirror_offset: Offset from GPU @addr to create
> +		 * CPU address mirror mappings. MBZ with the current level of
> +		 * support (i.e. only a 1 to 1 mapping between GPU and CPU
> +		 * mappings is supported).
> +		 */
> +		__s64 cpu_addr_mirror_offset;
>  	};
>  
>  	/**
> @@ -1078,6 +1104,7 @@ struct drm_xe_vm_bind_op {
>  #define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
>  #define DRM_XE_VM_BIND_FLAG_DUMPABLE	(1 << 3)
>  #define DRM_XE_VM_BIND_FLAG_CHECK_PXP	(1 << 4)
> +#define DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR	(1 << 5)
>  	/** @flags: Bind flags */
>  	__u32 flags;
>  
> @@ -1205,6 +1232,21 @@ struct drm_xe_vm_bind {
>   *     };
>   *     ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
>   *
> + *     Allow users to provide a hint to the kernel for cases demanding a low
> + *     latency profile. Please note it will have an impact on power consumption.
> + *     The user can indicate the low latency hint with a flag while creating
> + *     the exec queue, as shown below:
> + *
> + *     struct drm_xe_exec_queue_create exec_queue_create = {
> + *          .flags = DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT,
> + *          .extensions = 0,
> + *          .vm_id = vm,
> + *          .num_bb_per_exec = 1,
> + *          .num_eng_per_bb = 1,
> + *          .instances = to_user_pointer(&instance),
> + *     };
> + *     ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
> + *
>   */
>  struct drm_xe_exec_queue_create {
>  #define DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY		0
> @@ -1223,7 +1265,8 @@ struct drm_xe_exec_queue_create {
>  	/** @vm_id: VM to use for this exec queue */
>  	__u32 vm_id;
>  
> -	/** @flags: MBZ */
> +#define DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT	(1 << 0)
> +	/** @flags: flags to use for this exec queue */
>  	__u32 flags;
>  
>  	/** @exec_queue_id: Returned exec queue ID */
> -- 
> 2.34.1
> 
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v3 2/2] tests/xe: Add system_allocator test
  2025-04-24 20:44 ` [PATCH v3 2/2] tests/xe: Add system_allocator test Matthew Brost
@ 2025-04-24 22:32   ` Cavitt, Jonathan
  2025-04-24 22:39     ` Matthew Brost
  2025-04-25  7:06   ` Francois Dugast
  1 sibling, 1 reply; 8+ messages in thread
From: Cavitt, Jonathan @ 2025-04-24 22:32 UTC (permalink / raw)
  To: Brost, Matthew, igt-dev@lists.freedesktop.org
  Cc: Dugast, Francois, Cavitt, Jonathan

-----Original Message-----
From: igt-dev <igt-dev-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Thursday, April 24, 2025 1:44 PM
To: igt-dev@lists.freedesktop.org
Cc: Dugast, Francois <francois.dugast@intel.com>
Subject: [PATCH v3 2/2] tests/xe: Add system_allocator test
> 
> Test various uses of system allocator in single thread, multiple
> threads, and multiple processes.
> 
> Features tested:
>  - Malloc with various sizes
>  - Mmap with various sizes and flags including file backed mappings
>  - Mixing BO allocations with system allocator
>  - Various page sizes
>  - Dynamically freeing / unmapping memory
>  - Sharing VM across threads
>  - Faults racing on different hardware engines / GTs / Tiles
>  - GPU faults and CPU faults racing
>  - CPU faults on multiple threads racing
>  - CPU faults on multiple processes racing
>  - GPU faults of memory not faulted in by CPU
>  - Partial unmap of allocations
>  - Attempting to unmap system allocations when GPU has mappings
>  - Eviction of both system allocations and BOs
>  - Forking child processes and reading data from VRAM
>  - mremap data in VRAM
>  - Protection changes
>  - Multiple faults per execbuf
> 
> Running on LNL, BMG, PVC 1 tile, and PVC 2 tile.
> 
> v2:
>  - Rebase
>  - Fix memory allocation to not interfere with malloc (Thomas)
> v3:
>  - Fix memory leak (Francois)
>  - Break out uAPI into own patch (Francois)
>  - Use mkstemp for sync file (Francois)
>  - Use mkstemp for file backed data (Francois)
>  - Drop i argument from READ_VALUE (Francois)
>  - Fix test description (Francois)
>  - Add comment to check_all_pages_process (Francois)
>  - Prefer igt_info over printf (Francois)
>  - Fix types in messages (Francois)
>  - Prefer odd macro (Francois)
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

LGTM, though maybe it would be better to split the various tests
introduced in xe_exec_system_allocator.c across multiple patches,
instead of creating all of the tests at once?

I'm okay with creating the test series either way, so this isn't a
blocking request.

The remaining nits below are non-blocking as well, so this is:
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>

> ---
>  lib/xe/xe_ioctl.c                      |   12 +
>  lib/xe/xe_ioctl.h                      |    1 +
>  tests/intel/xe_exec_system_allocator.c | 1849 ++++++++++++++++++++++++
>  tests/meson.build                      |    1 +
>  4 files changed, 1863 insertions(+)
>  create mode 100644 tests/intel/xe_exec_system_allocator.c
> 
> diff --git a/lib/xe/xe_ioctl.c b/lib/xe/xe_ioctl.c
> index fb8c4aef13..785fc9184c 100644
> --- a/lib/xe/xe_ioctl.c
> +++ b/lib/xe/xe_ioctl.c
> @@ -440,6 +440,18 @@ void *xe_bo_map(int fd, uint32_t bo, size_t size)
>  	return __xe_bo_map(fd, bo, size, PROT_WRITE);
>  }
>  
> +void *xe_bo_map_fixed(int fd, uint32_t bo, size_t size, uint64_t addr)
> +{
> +	uint64_t mmo;
> +	void *map;
> +
> +	mmo = xe_bo_mmap_offset(fd, bo);
> +	map = mmap((void *)addr, size, PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, mmo);
> +	igt_assert(map != MAP_FAILED);
> +
> +	return map;
> +}
> +
>  void *xe_bo_mmap_ext(int fd, uint32_t bo, size_t size, int prot)
>  {
>  	return __xe_bo_map(fd, bo, size, prot);
> diff --git a/lib/xe/xe_ioctl.h b/lib/xe/xe_ioctl.h
> index 9bdf73b2bd..554a33c9cd 100644
> --- a/lib/xe/xe_ioctl.h
> +++ b/lib/xe/xe_ioctl.h
> @@ -86,6 +86,7 @@ uint32_t xe_exec_queue_create_class(int fd, uint32_t vm, uint16_t class);
>  void xe_exec_queue_destroy(int fd, uint32_t exec_queue);
>  uint64_t xe_bo_mmap_offset(int fd, uint32_t bo);
>  void *xe_bo_map(int fd, uint32_t bo, size_t size);
> +void *xe_bo_map_fixed(int fd, uint32_t bo, size_t size, long unsigned int addr);
>  void *xe_bo_mmap_ext(int fd, uint32_t bo, size_t size, int prot);
>  int __xe_exec(int fd, struct drm_xe_exec *exec);
>  void xe_exec(int fd, struct drm_xe_exec *exec);
> diff --git a/tests/intel/xe_exec_system_allocator.c b/tests/intel/xe_exec_system_allocator.c
> new file mode 100644
> index 0000000000..4839090cb2
> --- /dev/null
> +++ b/tests/intel/xe_exec_system_allocator.c
> @@ -0,0 +1,1849 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright (c) 2024 Intel Corporation

NIT:
Shouldn't this be a 2025 copyright?

> + */
> +
> +/**
> + * TEST: Basic tests for execbuf functionality using system allocator
> + * Category: Core
> + * Mega feature: USM
> + * Sub-category: System allocator
> + * Functionality: fault mode, system allocator
> + * GPU: LNL, BMG, PVC
> + */
> +
> +#include <fcntl.h>
> +#include <linux/mman.h>
> +#include <time.h>
> +
> +#include "igt.h"
> +#include "lib/igt_syncobj.h"
> +#include "lib/intel_reg.h"
> +#include "xe_drm.h"
> +
> +#include "xe/xe_ioctl.h"
> +#include "xe/xe_query.h"
> +#include <string.h>
> +
> +#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> +#define QUARTER_SEC		(NSEC_PER_SEC / 4)
> +#define FIVE_SEC		(5LL * NSEC_PER_SEC)
> +
> +struct batch_data {
> +	uint32_t batch[16];
> +	uint64_t pad;
> +	uint32_t data;
> +	uint32_t expected_data;
> +};
> +
> +#define WRITE_VALUE(data__, i__)	({			\
> +	if (!(data__)->expected_data)				\
> +		(data__)->expected_data = rand() << 12 | (i__);	\
> +	(data__)->expected_data;				\
> +})
> +#define READ_VALUE(data__)	((data__)->expected_data)
> +
> +static void __write_dword(uint32_t *batch, uint64_t sdi_addr, uint32_t wdata,
> +			int *idx)
> +{
> +	batch[(*idx)++] = MI_STORE_DWORD_IMM_GEN4;
> +	batch[(*idx)++] = sdi_addr;
> +	batch[(*idx)++] = sdi_addr >> 32;
> +	batch[(*idx)++] = wdata;
> +}
> +
> +static void write_dword(uint32_t *batch, uint64_t sdi_addr, uint32_t wdata,
> +			int *idx)
> +{
> +	__write_dword(batch, sdi_addr, wdata, idx);
> +	batch[(*idx)++] = MI_BATCH_BUFFER_END;
> +}
> +
> +static void check_all_pages(void *ptr, uint64_t alloc_size, uint64_t stride,
> +			    pthread_barrier_t *barrier)
> +{
> +	int i, n_writes = alloc_size / stride;
> +
> +	for (i = 0; i < n_writes; ++i) {
> +		struct batch_data *data = ptr + i * stride;
> +
> +		igt_assert_eq(data->data, READ_VALUE(data));
> +
> +		if (barrier)
> +			pthread_barrier_wait(barrier);
> +	}
> +}
> +
> +static char sync_file[] = "/tmp/xe_exec_system_allocator_syncXXXXXX";
> +static int sync_fd;
> +
> +static void open_sync_file(void)
> +{
> +	sync_fd = mkstemp(sync_file);
> +}
> +
> +static void close_sync_file(void)
> +{
> +	close(sync_fd);
> +}

NIT:
The sync_fd doesn't seem like it's being used for anything.  However, just
because I don't know what this file does doesn't mean it does nothing, so
I won't block on this.

> +
> +struct process_data {
> +	pthread_mutex_t mutex;
> +	pthread_cond_t cond;
> +	pthread_barrier_t barrier;
> +	bool go;
> +};
> +
> +static void wait_pdata(struct process_data *pdata)
> +{
> +	pthread_mutex_lock(&pdata->mutex);
> +	while (!pdata->go)
> +		pthread_cond_wait(&pdata->cond, &pdata->mutex);
> +	pthread_mutex_unlock(&pdata->mutex);
> +}
> +
> +static void init_pdata(struct process_data *pdata, int n_engine)
> +{
> +	pthread_mutexattr_t mutex_attr;
> +	pthread_condattr_t cond_attr;
> +	pthread_barrierattr_t barrier_attr;
> +
> +	pthread_mutexattr_init(&mutex_attr);
> +	pthread_mutexattr_setpshared(&mutex_attr, PTHREAD_PROCESS_SHARED);
> +	pthread_mutex_init(&pdata->mutex, &mutex_attr);
> +
> +	pthread_condattr_init(&cond_attr);
> +	pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED);
> +	pthread_cond_init(&pdata->cond, &cond_attr);
> +
> +	pthread_barrierattr_init(&barrier_attr);
> +	pthread_barrierattr_setpshared(&barrier_attr, PTHREAD_PROCESS_SHARED);
> +	pthread_barrier_init(&pdata->barrier, &barrier_attr, n_engine);
> +
> +	pdata->go = false;
> +}
> +
> +static void signal_pdata(struct process_data *pdata)
> +{
> +	pthread_mutex_lock(&pdata->mutex);
> +	pdata->go = true;
> +	pthread_cond_broadcast(&pdata->cond);
> +	pthread_mutex_unlock(&pdata->mutex);
> +}
> +
> +/* many_allocs flags */
> +#define MIX_BO_ALLOC		(0x1 << 0)
> +#define BENCHMARK		(0x1 << 1)
> +#define CPU_FAULT_THREADS	(0x1 << 2)
> +#define CPU_FAULT_PROCESS	(0x1 << 3)
> +#define CPU_FAULT_SAME_PAGE	(0x1 << 4)
> +
> +static void process_check(void *ptr, uint64_t alloc_size, uint64_t stride,
> +			  unsigned int flags)
> +{
> +	struct process_data *pdata;
> +	int map_fd;
> +
> +	map_fd = open(sync_file, O_RDWR, 0x666);
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +	wait_pdata(pdata);
> +
> +	if (flags & CPU_FAULT_SAME_PAGE)
> +		check_all_pages(ptr, alloc_size, stride, &pdata->barrier);
> +	else
> +		check_all_pages(ptr, alloc_size, stride, NULL);
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +/*
> + * Partition the checking of results into chunks, which causes multiple
> + * processes to fault the same VRAM allocation in parallel.
> + */
> +static void
> +check_all_pages_process(void *ptr, uint64_t alloc_size, uint64_t stride,
> +			int n_process, unsigned int flags)
> +{
> +	struct process_data *pdata;
> +	int map_fd, i;
> +
> +	map_fd = open(sync_file, O_RDWR | O_CREAT, 0x666);
> +	posix_fallocate(map_fd, 0, sizeof(*pdata));
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +
> +	init_pdata(pdata, n_process);
> +
> +	for (i = 0; i < n_process; ++i) {
> +		igt_fork(child, 1)
> +			if (flags & CPU_FAULT_SAME_PAGE)
> +				process_check(ptr, alloc_size, stride, flags);
> +			else
> +				process_check(ptr + stride * i, alloc_size,
> +					      stride * n_process, flags);
> +	}
> +
> +	signal_pdata(pdata);
> +	igt_waitchildren();
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +struct thread_check_data {
> +	pthread_t thread;
> +	pthread_mutex_t *mutex;
> +	pthread_cond_t *cond;
> +	pthread_barrier_t *barrier;
> +	void *ptr;
> +	uint64_t alloc_size;
> +	uint64_t stride;
> +	bool *go;
> +};
> +
> +static void *thread_check(void *data)
> +{
> +	struct thread_check_data *t = data;
> +
> +	pthread_mutex_lock(t->mutex);
> +	while (!*t->go)
> +		pthread_cond_wait(t->cond, t->mutex);
> +	pthread_mutex_unlock(t->mutex);
> +
> +	check_all_pages(t->ptr, t->alloc_size, t->stride, t->barrier);
> +
> +	return NULL;
> +}
> +
> +/*
> + * Partition the checking of results into chunks, which causes multiple
> + * threads to fault the same VRAM allocation in parallel.
> + */
> +static void
> +check_all_pages_threads(void *ptr, uint64_t alloc_size, uint64_t stride,
> +			int n_threads, unsigned int flags)
> +{
> +	struct thread_check_data *threads_check_data;
> +	pthread_mutex_t mutex;
> +	pthread_cond_t cond;
> +	pthread_barrier_t barrier;
> +	int i;
> +	bool go = false;
> +
> +	threads_check_data = calloc(n_threads, sizeof(*threads_check_data));
> +	igt_assert(threads_check_data);
> +
> +	pthread_mutex_init(&mutex, 0);
> +	pthread_cond_init(&cond, 0);
> +	pthread_barrier_init(&barrier, 0, n_threads);
> +
> +	for (i = 0; i < n_threads; ++i) {
> +		threads_check_data[i].mutex = &mutex;
> +		threads_check_data[i].cond = &cond;
> +		if (flags & CPU_FAULT_SAME_PAGE) {
> +			threads_check_data[i].barrier = &barrier;
> +			threads_check_data[i].ptr = ptr;
> +			threads_check_data[i].alloc_size = alloc_size;
> +			threads_check_data[i].stride = stride;
> +		} else {
> +			threads_check_data[i].barrier = NULL;
> +			threads_check_data[i].ptr = ptr + stride * i;
> +			threads_check_data[i].alloc_size = alloc_size;
> +			threads_check_data[i].stride = n_threads * stride;
> +		}
> +		threads_check_data[i].go = &go;
> +
> +		pthread_create(&threads_check_data[i].thread, 0, thread_check,
> +			       &threads_check_data[i]);
> +	}
> +
> +	pthread_mutex_lock(&mutex);
> +	go = true;
> +	pthread_cond_broadcast(&cond);
> +	pthread_mutex_unlock(&mutex);
> +
> +	for (i = 0; i < n_threads; ++i)
> +		pthread_join(threads_check_data[i].thread, NULL);
> +	free(threads_check_data);
> +}
> +
> +static void touch_all_pages(int fd, uint32_t exec_queue, void *ptr,
> +			    uint64_t alloc_size, uint64_t stride,
> +			    struct timespec *tv, uint64_t *submit)
> +{
> +	struct drm_xe_sync sync[1] = {
> +		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
> +		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +		  .timeline_value = USER_FENCE_VALUE },
> +	};
> +	struct drm_xe_exec exec = {
> +		.num_batch_buffer = 1,
> +		.num_syncs = 0,
> +		.exec_queue_id = exec_queue,
> +		.syncs = to_user_pointer(&sync),
> +	};
> +	uint64_t addr = to_user_pointer(ptr);
> +	int i, ret, n_writes = alloc_size / stride;
> +	u64 *exec_ufence = NULL;
> +	int64_t timeout = FIVE_SEC;
> +
> +	exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
> +			   PROT_WRITE, MAP_SHARED |
> +			   MAP_ANONYMOUS, -1, 0);
> +	igt_assert(exec_ufence != MAP_FAILED);
> +	memset(exec_ufence, 0, SZ_4K);
> +	sync[0].addr = to_user_pointer(exec_ufence);
> +
> +	for (i = 0; i < n_writes; ++i, addr += stride) {
> +		struct batch_data *data = ptr + i * stride;
> +		uint64_t sdi_offset = (char *)&data->data - (char *)data;
> +		uint64_t sdi_addr = addr + sdi_offset;
> +		int b = 0;
> +
> +		write_dword(data->batch, sdi_addr, WRITE_VALUE(data, i), &b);
> +		igt_assert(b <= ARRAY_SIZE(data->batch));
> +	}
> +
> +	igt_nsec_elapsed(tv);
> +	*submit = igt_nsec_elapsed(tv);
> +
> +	addr = to_user_pointer(ptr);
> +	for (i = 0; i < n_writes; ++i, addr += stride) {
> +		struct batch_data *data = ptr + i * stride;
> +		uint64_t batch_offset = (char *)&data->batch - (char *)data;
> +		uint64_t batch_addr = addr + batch_offset;
> +
> +		exec.address = batch_addr;
> +		if (i + 1 == n_writes)
> +			exec.num_syncs = 1;
> +		xe_exec(fd, &exec);
> +	}
> +
> +	ret = __xe_wait_ufence(fd, exec_ufence, USER_FENCE_VALUE, exec_queue,
> +			       &timeout);
> +	if (ret) {
> +		igt_info("FAIL EXEC_UFENCE_ADDR: 0x%016llx\n", sync[0].addr);
> +		igt_info("FAIL EXEC_UFENCE: EXPECTED=0x%016llx, ACTUAL=0x%016lx\n",
> +			 USER_FENCE_VALUE, exec_ufence[0]);
> +
> +		addr = to_user_pointer(ptr);
> +		for (i = 0; i < n_writes; ++i, addr += stride) {
> +			struct batch_data *data = ptr + i * stride;
> +			uint64_t batch_offset = (char *)&data->batch - (char *)data;
> +			uint64_t batch_addr = addr + batch_offset;
> +			uint64_t sdi_offset = (char *)&data->data - (char *)data;
> +			uint64_t sdi_addr = addr + sdi_offset;
> +
> +			igt_info("FAIL BATCH_ADDR: 0x%016lx\n", batch_addr);
> +			igt_info("FAIL SDI_ADDR: 0x%016lx\n", sdi_addr);
> +			igt_info("FAIL SDI_ADDR (in batch): 0x%016lx\n",
> +				 (((u64)data->batch[2]) << 32) | data->batch[1]);
> +			igt_info("FAIL DATA: EXPECTED=0x%08x, ACTUAL=0x%08x\n",
> +				 data->expected_data, data->data);
> +		}
> +		igt_assert_eq(ret, 0);
> +	}
> +	munmap(exec_ufence, SZ_4K);
> +}
> +
> +static int va_bits;
> +
> +#define bind_system_allocator(__sync, __num_sync)			\
> +	__xe_vm_bind_assert(fd, vm, 0,					\
> +			    0, 0, 0, 0x1ull << va_bits,			\
> +			    DRM_XE_VM_BIND_OP_MAP,			\
> +			    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,	\
> +			    (__sync), (__num_sync), 0, 0)
> +
> +#define unbind_system_allocator()				\
> +	__xe_vm_bind(fd, vm, 0, 0, 0, 0, 0x1ull << va_bits,	\
> +		     DRM_XE_VM_BIND_OP_UNMAP, 0,		\
> +		     NULL, 0, 0, 0, 0)
> +
> +#define odd(__i)	(__i & 1)
> +
> +struct aligned_alloc_type {
> +	void *__ptr;
> +	void *ptr;
> +	size_t __size;
> +	size_t size;
> +};
> +
> +static struct aligned_alloc_type __aligned_alloc(size_t alignment, size_t size)
> +{
> +	struct aligned_alloc_type aligned_alloc_type;
> +
> +	aligned_alloc_type.__ptr = mmap(NULL, alignment + size, PROT_NONE, MAP_PRIVATE |
> +			      MAP_ANONYMOUS, -1, 0);
> +	igt_assert(aligned_alloc_type.__ptr != MAP_FAILED);
> +
> +	aligned_alloc_type.ptr = (void *)ALIGN((uint64_t)aligned_alloc_type.__ptr, alignment);
> +	aligned_alloc_type.size = size;
> +	aligned_alloc_type.__size = size + alignment;
> +
> +	return aligned_alloc_type;
> +}
> +
> +static void __aligned_free(struct aligned_alloc_type  *aligned_alloc_type)
> +{
> +	munmap(aligned_alloc_type->__ptr, aligned_alloc_type->__size);
> +}
> +
> +static void __aligned_partial_free(struct aligned_alloc_type  *aligned_alloc_type)
> +{
> +	size_t begin_size = (size_t)(aligned_alloc_type->ptr - aligned_alloc_type->__ptr);
> +
> +	if (begin_size)
> +		munmap(aligned_alloc_type->__ptr, begin_size);
> +	if (aligned_alloc_type->__size - aligned_alloc_type->size - begin_size)
> +		munmap(aligned_alloc_type->ptr + aligned_alloc_type->size,
> +		       aligned_alloc_type->__size - aligned_alloc_type->size - begin_size);
> +}
> +
> +/**
> + * SUBTEST: unaligned-alloc
> + * Description: allocate unaligned sizes of memory
> + * Test category: functionality test
> + *
> + * SUBTEST: fault-benchmark
> + * Description: Benchmark how long GPU / CPU take
> + * Test category: performance test
> + *
> + * SUBTEST: fault-threads-benchmark
> + * Description: Benchmark how long GPU / CPU take, reading results with multiple threads
> + * Test category: performance and functionality test
> + *
> + * SUBTEST: fault-threads-same-page-benchmark
> + * Description: Benchmark how long GPU / CPU take, reading results with multiple threads, hammer same page
> + * Test category: performance and functionality test
> + *
> + * SUBTEST: fault-process-benchmark
> + * Description: Benchmark how long GPU / CPU take, reading results with multiple processes
> + * Test category: performance and functionality test
> + *
> + * SUBTEST: fault-process-same-page-benchmark
> + * Description: Benchmark how long GPU / CPU take, reading results with multiple processes, hammer same page
> + * Test category: performance and functionality test
> + *
> + * SUBTEST: evict-malloc
> + * Description: trigger eviction of VRAM allocated via malloc
> + * Test category: functionality test
> + *
> + * SUBTEST: evict-malloc-mix-bo
> + * Description: trigger eviction of VRAM allocated via malloc and BO create
> + * Test category: functionality test
> + *
> + * SUBTEST: processes-evict-malloc
> + * Description: multi-process trigger eviction of VRAM allocated via malloc
> + * Test category: stress test
> + *
> + * SUBTEST: processes-evict-malloc-mix-bo
> + * Description: multi-process trigger eviction of VRAM allocated via malloc and BO create
> + * Test category: stress test
> + */
> +
> +static void
> +many_allocs(int fd, struct drm_xe_engine_class_instance *eci,
> +	    uint64_t total_alloc, uint64_t alloc_size, uint64_t stride,
> +	    pthread_barrier_t *barrier, unsigned int flags)
> +{
> +	uint32_t vm, exec_queue;
> +	int num_allocs = flags & BENCHMARK ? 1 :
> +		(9 * (total_alloc / alloc_size)) / 8;
> +	struct aligned_alloc_type *allocs;
> +	uint32_t *bos = NULL;
> +	struct timespec tv = {};
> +	uint64_t submit, read, elapsed;
> +	int i;
> +
> +	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> +			  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> +	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
> +
> +	bind_system_allocator(NULL, 0);
> +
> +	allocs = malloc(sizeof(*allocs) * num_allocs);
> +	igt_assert(allocs);
> +	memset(allocs, 0, sizeof(*allocs) * num_allocs);
> +
> +	if (flags & MIX_BO_ALLOC) {
> +		bos = malloc(sizeof(*bos) * num_allocs);
> +		igt_assert(bos);
> +		memset(bos, 0, sizeof(*bos) * num_allocs);
> +	}
> +
> +	for (i = 0; i < num_allocs; ++i) {
> +		struct aligned_alloc_type alloc;
> +
> +		if (flags & MIX_BO_ALLOC && odd(i)) {
> +			uint32_t bo_flags =
> +				DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
> +
> +			alloc = __aligned_alloc(SZ_2M, alloc_size);
> +			igt_assert(alloc.ptr);
> +
> +			bos[i] = xe_bo_create(fd, vm, alloc_size,
> +					      vram_if_possible(fd, eci->gt_id),
> +					      bo_flags);
> +			alloc.ptr = xe_bo_map_fixed(fd, bos[i], alloc_size,
> +						    to_user_pointer(alloc.ptr));
> +			xe_vm_bind_async(fd, vm, 0, bos[i], 0,
> +					 to_user_pointer(alloc.ptr),
> +					 alloc_size, 0, 0);
> +		} else {
> +			alloc.ptr = aligned_alloc(SZ_2M, alloc_size);
> +			igt_assert(alloc.ptr);
> +		}
> +		allocs[i] = alloc;
> +
> +		touch_all_pages(fd, exec_queue, allocs[i].ptr, alloc_size, stride,
> +				&tv, &submit);
> +	}
> +
> +	if (barrier)
> +		pthread_barrier_wait(barrier);
> +
> +	for (i = 0; i < num_allocs; ++i) {
> +		if (flags & BENCHMARK)
> +			read = igt_nsec_elapsed(&tv);
> +#define NUM_CHECK_THREADS	8
> +		if (flags & CPU_FAULT_PROCESS)
> +			check_all_pages_process(allocs[i].ptr, alloc_size, stride,
> +						NUM_CHECK_THREADS, flags);
> +		else if (flags & CPU_FAULT_THREADS)
> +			check_all_pages_threads(allocs[i].ptr, alloc_size, stride,
> +						NUM_CHECK_THREADS, flags);
> +		else
> +			check_all_pages(allocs[i].ptr, alloc_size, stride, NULL);
> +		if (flags & BENCHMARK) {
> +			elapsed = igt_nsec_elapsed(&tv);
> +			igt_info("Execution took %.3fms (submit %.1fus, read %.1fus, total %.1fus, read_total %.1fus)\n",
> +				 1e-6 * elapsed, 1e-3 * submit, 1e-3 * read,
> +				 1e-3 * (elapsed - submit),
> +				 1e-3 * (elapsed - read));
> +		}
> +		if (bos && bos[i]) {
> +			__aligned_free(allocs + i);
> +			gem_close(fd, bos[i]);
> +		} else {
> +			free(allocs[i].ptr);
> +		}
> +	}
> +	if (bos)
> +		free(bos);
> +	free(allocs);
> +	xe_exec_queue_destroy(fd, exec_queue);
> +	xe_vm_destroy(fd, vm);
> +}
> +
> +static void process_evict(struct drm_xe_engine_class_instance *hwe,
> +			  uint64_t total_alloc, uint64_t alloc_size,
> +			  uint64_t stride, unsigned int flags)
> +{
> +	struct process_data *pdata;
> +	int map_fd;
> +	int fd;
> +
> +	map_fd = open(sync_file, O_RDWR, 0x666);
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +	wait_pdata(pdata);
> +
> +	fd = drm_open_driver(DRIVER_XE);
> +	many_allocs(fd, hwe, total_alloc, alloc_size, stride, &pdata->barrier,
> +		    flags);
> +	drm_close_driver(fd);
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +static void
> +processes_evict(int fd, uint64_t alloc_size, uint64_t stride,
> +		unsigned int flags)
> +{
> +	struct drm_xe_engine_class_instance *hwe;
> +	struct process_data *pdata;
> +	int n_engine_gt[2] = { 0, 0 }, n_engine = 0;
> +	int map_fd;
> +
> +	map_fd = open(sync_file, O_RDWR | O_CREAT, 0x666);
> +	posix_fallocate(map_fd, 0, sizeof(*pdata));
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +
> +	xe_for_each_engine(fd, hwe) {
> +		igt_assert(hwe->gt_id < 2);
> +		n_engine_gt[hwe->gt_id]++;
> +		n_engine++;
> +	}
> +
> +	init_pdata(pdata, n_engine);
> +
> +	xe_for_each_engine(fd, hwe) {
> +		igt_fork(child, 1)
> +			process_evict(hwe,
> +				      xe_visible_vram_size(fd, hwe->gt_id) /
> +				      n_engine_gt[hwe->gt_id], alloc_size,
> +				      stride, flags);
> +	}
> +
> +	signal_pdata(pdata);
> +	igt_waitchildren();
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +#define CPU_FAULT	(0x1 << 0)
> +#define REMAP		(0x1 << 1)
> +#define MIDDLE		(0x1 << 2)
> +
> +/**
> + * SUBTEST: partial-munmap-cpu-fault
> + * Description: munmap partially with cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-munmap-no-cpu-fault
> + * Description: munmap partially with no cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-remap-cpu-fault
> + * Description: remap partially with cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-remap-no-cpu-fault
> + * Description: remap partially with no cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-middle-munmap-cpu-fault
> + * Description: munmap middle with cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-middle-munmap-no-cpu-fault
> + * Description: munmap middle with no cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-middle-remap-cpu-fault
> + * Description: remap middle with cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-middle-remap-no-cpu-fault
> + * Description: remap middle with no cpu access in between
> + * Test category: functionality test
> + */
> +
> +static void
> +partial(int fd, struct drm_xe_engine_class_instance *eci, unsigned int flags)
> +{
> +	struct drm_xe_sync sync[1] = {
> +		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +	          .timeline_value = USER_FENCE_VALUE },
> +	};
> +	struct drm_xe_exec exec = {
> +		.num_batch_buffer = 1,
> +		.num_syncs = 1,
> +		.syncs = to_user_pointer(sync),
> +	};
> +	struct {
> +		uint32_t batch[16];
> +		uint64_t pad;
> +		uint64_t vm_sync;
> +		uint64_t exec_sync;
> +		uint32_t data;
> +		uint32_t expected_data;
> +	} *data;
> +	size_t bo_size = SZ_2M, unmap_offset = 0;
> +	uint32_t vm, exec_queue;
> +	u64 *exec_ufence = NULL;
> +	int i;
> +	void *old, *new = NULL;
> +	struct aligned_alloc_type alloc;
> +
> +	if (flags & MIDDLE)
> +		unmap_offset = bo_size / 4;
> +
> +	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> +			  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> +
> +	alloc = __aligned_alloc(bo_size, bo_size);
> +	igt_assert(alloc.ptr);
> +
> +	data = mmap(alloc.ptr, bo_size, PROT_READ | PROT_WRITE,
> +		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
> +	igt_assert(data != MAP_FAILED);
> +	memset(data, 0, bo_size);
> +	old = data;
> +
> +	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
> +
> +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +	bind_system_allocator(sync, 1);
> +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0, FIVE_SEC);
> +	data[0].vm_sync = 0;
> +
> +	exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
> +			   PROT_WRITE, MAP_SHARED |
> +			   MAP_ANONYMOUS, -1, 0);
> +	igt_assert(exec_ufence != MAP_FAILED);
> +	memset(exec_ufence, 0, SZ_4K);
> +
> +	for (i = 0; i < 2; i++) {
> +		uint64_t addr = to_user_pointer(data);
> +		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> +		uint64_t sdi_addr = addr + sdi_offset;
> +		int b = 0;
> +
> +		write_dword(data[i].batch, sdi_addr, WRITE_VALUE(&data[i], i), &b);
> +		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> +
> +		if (!i)
> +			data = old + unmap_offset + bo_size / 2;
> +	}
> +
> +	data = old;
> +	exec.exec_queue_id = exec_queue;
> +
> +	for (i = 0; i < 2; i++) {
> +		uint64_t addr = to_user_pointer(data);
> +		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> +		uint64_t batch_addr = addr + batch_offset;
> +
> +		sync[0].addr = new ? to_user_pointer(new) :
> +			to_user_pointer(exec_ufence);
> +		exec.address = batch_addr;
> +		xe_exec(fd, &exec);
> +
> +		xe_wait_ufence(fd, new ?: exec_ufence, USER_FENCE_VALUE,
> +			       exec_queue, FIVE_SEC);
> +		if (i || (flags & CPU_FAULT))
> +			igt_assert_eq(data[i].data, READ_VALUE(&data[i]));
> +		exec_ufence[0] = 0;
> +
> +		if (!i) {
> +			data = old + unmap_offset + bo_size / 2;
> +			munmap(old + unmap_offset, bo_size / 2);
> +			if (flags & REMAP) {
> +				new = mmap(old + unmap_offset, bo_size / 2,
> +					   PROT_READ | PROT_WRITE,
> +					   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
> +					   MAP_LOCKED, -1, 0);
> +				igt_assert(new != MAP_FAILED);
> +			}
> +		}
> +	}
> +
> +	xe_exec_queue_destroy(fd, exec_queue);
> +	munmap(exec_ufence, SZ_4K);
> +	__aligned_free(&alloc);
> +	if (new)
> +		munmap(new, bo_size / 2);
> +	xe_vm_destroy(fd, vm);
> +}
> +
> +#define MAX_N_EXEC_QUEUES	16
> +
> +#define MMAP		(0x1 << 0)
> +#define NEW		(0x1 << 1)
> +#define BO_UNMAP	(0x1 << 2)
> +#define FREE		(0x1 << 3)
> +#define BUSY		(0x1 << 4)
> +#define BO_MAP		(0x1 << 5)
> +#define RACE		(0x1 << 6)
> +#define SKIP_MEMSET	(0x1 << 7)
> +#define FAULT		(0x1 << 8)
> +#define FILE_BACKED	(0x1 << 9)
> +#define LOCK		(0x1 << 10)
> +#define MMAP_SHARED	(0x1 << 11)
> +#define HUGE_PAGE	(0x1 << 12)
> +#define SHARED_ALLOC	(0x1 << 13)
> +#define FORK_READ	(0x1 << 14)
> +#define FORK_READ_AFTER	(0x1 << 15)
> +#define MREMAP		(0x1 << 16)
> +#define DONTUNMAP	(0x1 << 17)
> +#define READ_ONLY_REMAP	(0x1 << 18)
> +#define SYNC_EXEC	(0x1 << 19)
> +#define EVERY_OTHER_CHECK	(0x1 << 20)
> +#define MULTI_FAULT	(0x1 << 21)

NIT:
The above doesn't look aligned, but I'm willing to bet that has
to do with my email client and that it's aligned in code.

However, EVERY_OTHER_CHECK is a notable standout.
IMO, all of the values should be aligned to its column if they aren't
already.

-Jonathan Cavitt

> +
> +#define N_MULTI_FAULT	4
> +
> +/**
> + * SUBTEST: once-%s
> + * Description: Run %arg[1] system allocator test only once
> + * Test category: functionality test
> + *
> + * SUBTEST: once-large-%s
> + * Description: Run %arg[1] system allocator test only once with large allocation
> + * Test category: functionality test
> + *
> + * SUBTEST: twice-%s
> + * Description: Run %arg[1] system allocator test twice
> + * Test category: functionality test
> + *
> + * SUBTEST: twice-large-%s
> + * Description: Run %arg[1] system allocator test twice with large allocation
> + * Test category: functionality test
> + *
> + * SUBTEST: many-%s
> + * Description: Run %arg[1] system allocator test many times
> + * Test category: stress test
> + *
> + * SUBTEST: many-stride-%s
> + * Description: Run %arg[1] system allocator test many times with a stride on each exec
> + * Test category: stress test
> + *
> + * SUBTEST: many-execqueues-%s
> + * Description: Run %arg[1] system allocator test on many exec_queues
> + * Test category: stress test
> + *
> + * SUBTEST: many-large-%s
> + * Description: Run %arg[1] system allocator test many times with large allocations
> + * Test category: stress test
> + *
> + * SUBTEST: many-large-execqueues-%s
> + * Description: Run %arg[1] system allocator test on many exec_queues with large allocations
> + *
> + * SUBTEST: threads-many-%s
> + * Description: Run %arg[1] system allocator threaded test many times
> + * Test category: stress test
> + *
> + * SUBTEST: threads-many-stride-%s
> + * Description: Run %arg[1] system allocator threaded test many times with a stride on each exec
> + * Test category: stress test
> + *
> + * SUBTEST: threads-many-execqueues-%s
> + * Description: Run %arg[1] system allocator threaded test on many exec_queues
> + * Test category: stress test
> + *
> + * SUBTEST: threads-many-large-%s
> + * Description: Run %arg[1] system allocator threaded test many times with large allocations
> + * Test category: stress test
> + *
> + * SUBTEST: threads-many-large-execqueues-%s
> + * Description: Run %arg[1] system allocator threaded test on many exec_queues with large allocations
> + *
> + * SUBTEST: threads-shared-vm-many-%s
> + * Description: Run %arg[1] system allocator threaded, shared vm test many times
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-vm-many-stride-%s
> + * Description: Run %arg[1] system allocator threaded, shared vm test many times with a stride on each exec
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-vm-many-execqueues-%s
> + * Description: Run %arg[1] system allocator threaded, shared vm test on many exec_queues
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-vm-many-large-%s
> + * Description: Run %arg[1] system allocator threaded, shared vm test many times with large allocations
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-vm-many-large-execqueues-%s
> + * Description: Run %arg[1] system allocator threaded, shared vm test on many exec_queues with large allocations
> + * Test category: stress test
> + *
> + * SUBTEST: process-many-%s
> + * Description: Run %arg[1] system allocator multi-process test many times
> + * Test category: stress test
> + *
> + * SUBTEST: process-many-stride-%s
> + * Description: Run %arg[1] system allocator multi-process test many times with a stride on each exec
> + * Test category: stress test
> + *
> + * SUBTEST: process-many-execqueues-%s
> + * Description: Run %arg[1] system allocator multi-process test on many exec_queues
> + * Test category: stress test
> + *
> + * SUBTEST: process-many-large-%s
> + * Description: Run %arg[1] system allocator multi-process test many times with large allocations
> + * Test category: stress test
> + *
> + * SUBTEST: process-many-large-execqueues-%s
> + * Description: Run %arg[1] system allocator multi-process test on many exec_queues with large allocations
> + *
> + * SUBTEST: fault
> + * Description: use a bad system allocator address resulting in a fault
> + * Test category: bad input
> + *
> + * arg[1]:
> + *
> + * @malloc:				malloc single buffer for all execs
> + * @malloc-multi-fault:			malloc single buffer for all execs, issue a command which will trigger multiple faults
> + * @malloc-fork-read:			malloc single buffer for all execs, fork a process to read test output
> + * @malloc-fork-read-after:		malloc single buffer for all execs, fork a process to read test output, check again after fork returns in parent
> + * @malloc-mlock:			malloc and mlock single buffer for all execs
> + * @malloc-race:			malloc single buffer for all execs with race between cpu and gpu access
> + * @malloc-bo-unmap:			malloc single buffer for all execs, bind and unbind a BO to same address before execs
> + * @malloc-busy:			malloc single buffer for all execs, try to unbind while buffer valid
> + * @mmap:				mmap single buffer for all execs
> + * @mmap-remap:				mmap and mremap a buffer for all execs
> + * @mmap-remap-dontunmap:		mmap and mremap a buffer with dontunmap flag for all execs
> + * @mmap-remap-ro:			mmap and mremap a read-only buffer for all execs
> + * @mmap-remap-ro-dontunmap:		mmap and mremap a read-only buffer with dontunmap flag for all execs
> + * @mmap-remap-eocheck:			mmap and mremap a buffer for all execs, check data every other loop iteration
> + * @mmap-remap-dontunmap-eocheck:	mmap and mremap a buffer with dontunmap flag for all execs, check data every other loop iteration
> + * @mmap-remap-ro-eocheck:		mmap and mremap a read-only buffer for all execs, check data every other loop iteration
> + * @mmap-remap-ro-dontunmap-eocheck:	mmap and mremap a read-only buffer with dontunmap flag for all execs, check data every other loop iteration
> + * @mmap-huge:				mmap huge page single buffer for all execs
> + * @mmap-shared:			mmap shared single buffer for all execs
> + * @mmap-shared-remap:			mmap shared and mremap a buffer for all execs
> + * @mmap-shared-remap-dontunmap:	mmap shared and mremap a buffer with dontunmap flag for all execs
> + * @mmap-shared-remap-eocheck:		mmap shared and mremap a buffer for all execs, check data every other loop iteration
> + * @mmap-shared-remap-dontunmap-eocheck:	mmap shared and mremap a buffer with dontunmap flag for all execs, check data every other loop iteration
> + * @mmap-mlock:				mmap and mlock single buffer for all execs
> + * @mmap-file:				mmap single buffer, with file backing, for all execs
> + * @mmap-file-mlock:			mmap and mlock single buffer, with file backing, for all execs
> + * @mmap-race:				mmap single buffer for all execs with race between cpu and gpu access
> + * @free:				malloc and free buffer for each exec
> + * @free-race:				malloc and free buffer for each exec with race between cpu and gpu access
> + * @new:				malloc a new buffer for each exec
> + * @new-race:				malloc a new buffer for each exec with race between cpu and gpu access
> + * @new-bo-map:				malloc a new buffer or map BO for each exec
> + * @new-busy:				malloc a new buffer for each exec, try to unbind while buffers valid
> + * @mmap-free:				mmap and free buffer for each exec
> + * @mmap-free-huge:			mmap huge page and free buffer for each exec
> + * @mmap-free-race:			mmap and free buffer for each exec with race between cpu and gpu access
> + * @mmap-new:				mmap a new buffer for each exec
> + * @mmap-new-huge:			mmap a new huge page buffer for each exec
> + * @mmap-new-race:			mmap a new buffer for each exec with race between cpu and gpu access
> + * @malloc-nomemset:			malloc single buffer for all execs, skip memset of buffers
> + * @malloc-mlock-nomemset:		malloc and mlock single buffer for all execs, skip memset of buffers
> + * @malloc-race-nomemset:		malloc single buffer for all execs with race between cpu and gpu access, skip memset of buffers
> + * @malloc-bo-unmap-nomemset:		malloc single buffer for all execs, bind and unbind a BO to same address before execs, skip memset of buffers
> + * @malloc-busy-nomemset:		malloc single buffer for all execs, try to unbind while buffer valid, skip memset of buffers
> + * @mmap-nomemset:			mmap single buffer for all execs, skip memset of buffers
> + * @mmap-huge-nomemset:			mmap huge page single buffer for all execs, skip memset of buffers
> + * @mmap-shared-nomemset:		mmap shared single buffer for all execs, skip memset of buffers
> + * @mmap-mlock-nomemset:		mmap and mlock single buffer for all execs, skip memset of buffers
> + * @mmap-file-nomemset:			mmap single buffer, with file backing, for all execs, skip memset of buffers
> + * @mmap-file-mlock-nomemset:		mmap and mlock single buffer, with file backing, for all execs, skip memset of buffers
> + * @mmap-race-nomemset:			mmap single buffer for all execs with race between cpu and gpu access, skip memset of buffers
> + * @free-nomemset:			malloc and free buffer for each exec, skip memset of buffers
> + * @free-race-nomemset:			malloc and free buffer for each exec with race between cpu and gpu access, skip memset of buffers
> + * @new-nomemset:			malloc a new buffer for each exec, skip memset of buffers
> + * @new-race-nomemset:			malloc a new buffer for each exec with race between cpu and gpu access, skip memset of buffers
> + * @new-bo-map-nomemset:		malloc a new buffer or map BO for each exec, skip memset of buffers
> + * @new-busy-nomemset:			malloc a new buffer for each exec, try to unbind while buffers valid, skip memset of buffers
> + * @mmap-free-nomemset:			mmap and free buffer for each exec, skip memset of buffers
> + * @mmap-free-huge-nomemset:		mmap huge page and free buffer for each exec, skip memset of buffers
> + * @mmap-free-race-nomemset:		mmap and free buffer for each exec with race between cpu and gpu access, skip memset of buffers
> + * @mmap-new-nomemset:			mmap a new buffer for each exec, skip memset of buffers
> + * @mmap-new-huge-nomemset:		mmap a new huge page buffer for each exec, skip memset of buffers
> + * @mmap-new-race-nomemset:		mmap a new buffer for each exec with race between cpu and gpu access, skip memset of buffers
> + *
> + * SUBTEST: threads-shared-vm-shared-alloc-many-stride-malloc
> + * Description: Create multiple threads with a shared VM triggering faults on different hardware engines to same addresses
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-vm-shared-alloc-many-stride-malloc-race
> + * Description: Create multiple threads with a shared VM triggering faults on different hardware engines to same addresses, racing between CPU and GPU access
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-alloc-many-stride-malloc
> + * Description: Create multiple threads triggering faults on different hardware engines to same addresses
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-alloc-many-stride-malloc-sync
> + * Description: Create multiple threads triggering faults on different hardware engines to same addresses, syncing on each exec
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-alloc-many-stride-malloc-race
> + * Description: Create multiple threads triggering faults on different hardware engines to same addresses, racing between CPU and GPU access
> + * Test category: stress test
> + */
> +
> +struct test_exec_data {
> +	uint32_t batch[32];
> +	uint64_t pad;
> +	uint64_t vm_sync;
> +	uint64_t exec_sync;
> +	uint32_t data;
> +	uint32_t expected_data;
> +};
> +
> +static void
> +test_exec(int fd, struct drm_xe_engine_class_instance *eci,
> +	  int n_exec_queues, int n_execs, size_t bo_size,
> +	  size_t stride, uint32_t vm, void *alloc, pthread_barrier_t *barrier,
> +	  unsigned int flags)
> +{
> +	uint64_t addr;
> +	struct drm_xe_sync sync[1] = {
> +		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +	          .timeline_value = USER_FENCE_VALUE },
> +	};
> +	struct drm_xe_exec exec = {
> +		.num_batch_buffer = 1,
> +		.num_syncs = 1,
> +		.syncs = to_user_pointer(sync),
> +	};
> +	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> +	struct test_exec_data *data, *next_data = NULL;
> +	uint32_t bo_flags;
> +	uint32_t bo = 0;
> +	void **pending_free;
> +	u64 *exec_ufence = NULL;
> +	int i, j, b, file_fd = -1, prev_idx;
> +	bool free_vm = false;
> +	size_t aligned_size = bo_size ?: xe_get_default_alignment(fd);
> +	size_t orig_size = bo_size;
> +	struct aligned_alloc_type aligned_alloc_type;
> +
> +	if (flags & MULTI_FAULT) {
> +		if (!bo_size)
> +			return;
> +
> +		bo_size *= N_MULTI_FAULT;
> +	}
> +
> +	if (flags & SHARED_ALLOC)
> +		return;
> +
> +	if (flags & EVERY_OTHER_CHECK && odd(n_execs))
> +		return;
> +
> +	if (flags & EVERY_OTHER_CHECK)
> +		igt_assert(flags & MREMAP);
> +
> +	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> +
> +	if (flags & NEW && !(flags & FREE)) {
> +		pending_free = malloc(sizeof(*pending_free) * n_execs);
> +		igt_assert(pending_free);
> +		memset(pending_free, 0, sizeof(*pending_free) * n_execs);
> +	}
> +
> +	if (!vm) {
> +		vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> +				  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> +		free_vm = true;
> +	}
> +	if (!bo_size) {
> +		if (!stride) {
> +			bo_size = sizeof(*data) * n_execs;
> +			bo_size = xe_bb_size(fd, bo_size);
> +		} else {
> +			bo_size = stride * n_execs * sizeof(*data);
> +			bo_size = xe_bb_size(fd, bo_size);
> +		}
> +	}
> +	if (flags & HUGE_PAGE) {
> +		aligned_size = ALIGN(aligned_size, SZ_2M);
> +		bo_size = ALIGN(bo_size, SZ_2M);
> +	}
> +
> +	if (alloc) {
> +		data = alloc;
> +	} else {
> +		if (flags & MMAP) {
> +			int mmap_flags = MAP_FIXED;
> +
> +			aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
> +			data = aligned_alloc_type.ptr;
> +			igt_assert(data);
> +			__aligned_partial_free(&aligned_alloc_type);
> +
> +			if (flags & MMAP_SHARED)
> +				mmap_flags |= MAP_SHARED;
> +			else
> +				mmap_flags |= MAP_PRIVATE;
> +
> +			if (flags & HUGE_PAGE)
> +				mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB;
> +
> +			if (flags & FILE_BACKED) {
> +				char name[] = "/tmp/xe_exec_system_allocator_datXXXXXX";
> +
> +				igt_assert(!(flags & NEW));
> +
> +				file_fd = mkstemp(name);
> +				posix_fallocate(file_fd, 0, bo_size);
> +			} else {
> +				mmap_flags |= MAP_ANONYMOUS;
> +			}
> +
> +			data = mmap(data, bo_size, PROT_READ |
> +				    PROT_WRITE, mmap_flags, file_fd, 0);
> +			igt_assert(data != MAP_FAILED);
> +		} else {
> +			data = aligned_alloc(aligned_size, bo_size);
> +			igt_assert(data);
> +		}
> +		if (!(flags & SKIP_MEMSET))
> +			memset(data, 0, bo_size);
> +		if (flags & LOCK) {
> +			igt_assert(!(flags & NEW));
> +			mlock(data, bo_size);
> +		}
> +	}
> +
> +	for (i = 0; i < n_exec_queues; i++)
> +		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> +
> +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +	if (free_vm) {
> +		bind_system_allocator(sync, 1);
> +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0, FIVE_SEC);
> +	}
> +	data[0].vm_sync = 0;
> +
> +	addr = to_user_pointer(data);
> +
> +	if (flags & BO_UNMAP) {
> +		bo_flags = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
> +		bo = xe_bo_create(fd, vm, bo_size,
> +				  vram_if_possible(fd, eci->gt_id), bo_flags);
> +		xe_vm_bind_async(fd, vm, 0, bo, 0, addr, bo_size, 0, 0);
> +
> +		__xe_vm_bind_assert(fd, vm, 0,
> +				    0, 0, addr, bo_size,
> +				    DRM_XE_VM_BIND_OP_MAP,
> +				    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR, sync,
> +				    1, 0, 0);
> +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0,
> +			       FIVE_SEC);
> +		data[0].vm_sync = 0;
> +		gem_close(fd, bo);
> +		bo = 0;
> +	}
> +
> +	if (!(flags & RACE)) {
> +		exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
> +				   PROT_WRITE, MAP_SHARED |
> +				   MAP_ANONYMOUS, -1, 0);
> +		igt_assert(exec_ufence != MAP_FAILED);
> +		memset(exec_ufence, 0, SZ_4K);
> +	}
> +
> +	for (i = 0; i < n_execs; i++) {
> +		int idx = !stride ? i : i * stride, next_idx = !stride
> +			? (i + 1) : (i + 1) * stride;
> +		uint64_t batch_offset = (char *)&data[idx].batch - (char *)data;
> +		uint64_t batch_addr = addr + batch_offset;
> +		uint64_t sdi_offset = (char *)&data[idx].data - (char *)data;
> +		uint64_t sdi_addr = addr + sdi_offset;
> +		int e = i % n_exec_queues, err;
> +		bool fault_inject = (FAULT & flags) && i == n_execs / 2;
> +		bool fault_injected = (FAULT & flags) && i > n_execs;
> +
> +		if (barrier)
> +			pthread_barrier_wait(barrier);
> +
> +		if (flags & MULTI_FAULT) {
> +			b = 0;
> +			for (j = 0; j < N_MULTI_FAULT - 1; ++j)
> +				__write_dword(data[idx].batch,
> +					      sdi_addr + j * orig_size,
> +					      WRITE_VALUE(&data[idx], idx), &b);
> +			write_dword(data[idx].batch, sdi_addr + j * orig_size,
> +				    WRITE_VALUE(&data[idx], idx), &b);
> +			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> +		} else if (!(flags & EVERY_OTHER_CHECK)) {
> +			b = 0;
> +			write_dword(data[idx].batch, sdi_addr,
> +				    WRITE_VALUE(&data[idx], idx), &b);
> +			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> +		} else if (flags & EVERY_OTHER_CHECK && !odd(i)) {
> +			b = 0;
> +			write_dword(data[idx].batch, sdi_addr,
> +				    WRITE_VALUE(&data[idx], idx), &b);
> +			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> +
> +			aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
> +			next_data = aligned_alloc_type.ptr;
> +			igt_assert(next_data);
> +			__aligned_partial_free(&aligned_alloc_type);
> +
> +			b = 0;
> +			write_dword(data[next_idx].batch,
> +				    to_user_pointer(next_data) +
> +				    (char *)&data[next_idx].data - (char *)data,
> +				    WRITE_VALUE(&data[next_idx], next_idx), &b);
> +			igt_assert(b <= ARRAY_SIZE(data[next_idx].batch));
> +		}
> +
> +		if (!exec_ufence)
> +			data[idx].exec_sync = 0;
> +
> +		sync[0].addr = exec_ufence ? to_user_pointer(exec_ufence) :
> +			addr + (char *)&data[idx].exec_sync - (char *)data;
> +
> +		exec.exec_queue_id = exec_queues[e];
> +		if (fault_inject)
> +			exec.address = batch_addr * 2;
> +		else
> +			exec.address = batch_addr;
> +
> +		if (fault_injected) {
> +			err = __xe_exec(fd, &exec);
> +			igt_assert(err == -ENOENT);
> +		} else {
> +			xe_exec(fd, &exec);
> +		}
> +
> +		if (barrier)
> +			pthread_barrier_wait(barrier);
> +
> +		if (fault_inject || fault_injected) {
> +			int64_t timeout = QUARTER_SEC;
> +
> +			err = __xe_wait_ufence(fd, exec_ufence ? exec_ufence :
> +					       &data[idx].exec_sync,
> +					       USER_FENCE_VALUE,
> +					       exec_queues[e], &timeout);
> +			igt_assert(err == -ETIME || err == -EIO);
> +		} else {
> +			xe_wait_ufence(fd, exec_ufence ? exec_ufence :
> +				       &data[idx].exec_sync, USER_FENCE_VALUE,
> +				       exec_queues[e], FIVE_SEC);
> +			if (flags & LOCK && !i)
> +				munlock(data, bo_size);
> +
> +			if (flags & MREMAP) {
> +				void *old = data;
> +				int remap_flags = MREMAP_MAYMOVE | MREMAP_FIXED;
> +
> +				if (flags & DONTUNMAP)
> +					remap_flags |= MREMAP_DONTUNMAP;
> +
> +				if (flags & READ_ONLY_REMAP)
> +					igt_assert(!mprotect(old, bo_size,
> +							     PROT_READ));
> +
> +				if (!next_data) {
> +					aligned_alloc_type = __aligned_alloc(aligned_size,
> +								    bo_size);
> +					data = aligned_alloc_type.ptr;
> +					__aligned_partial_free(&aligned_alloc_type);
> +				} else {
> +					data = next_data;
> +				}
> +				next_data = NULL;
> +				igt_assert(data);
> +
> +				data = mremap(old, bo_size, bo_size,
> +					      remap_flags, data);
> +				igt_assert(data != MAP_FAILED);
> +
> +				if (flags & READ_ONLY_REMAP)
> +					igt_assert(!mprotect(data, bo_size,
> +							     PROT_READ |
> +							     PROT_WRITE));
> +
> +				addr = to_user_pointer(data);
> +				if (flags & DONTUNMAP)
> +					munmap(old, bo_size);
> +			}
> +
> +			if (!(flags & EVERY_OTHER_CHECK) || odd(i)) {
> +				if (flags & FORK_READ) {
> +					igt_fork(child, 1)
> +						igt_assert_eq(data[idx].data,
> +							      READ_VALUE(&data[idx]));
> +					if (!(flags & FORK_READ_AFTER))
> +						igt_assert_eq(data[idx].data,
> +							      READ_VALUE(&data[idx]));
> +					igt_waitchildren();
> +					if (flags & FORK_READ_AFTER)
> +						igt_assert_eq(data[idx].data,
> +							      READ_VALUE(&data[idx]));
> +				} else {
> +					igt_assert_eq(data[idx].data,
> +						      READ_VALUE(&data[idx]));
> +
> +					if (flags & MULTI_FAULT) {
> +						for (j = 1; j < N_MULTI_FAULT; ++j) {
> +							struct test_exec_data *__data =
> +								((void *)data) + j * orig_size;
> +
> +							igt_assert_eq(__data[idx].data,
> +								      READ_VALUE(&data[idx]));
> +						}
> +					}
> +				}
> +				if (flags & EVERY_OTHER_CHECK)
> +					igt_assert_eq(data[prev_idx].data,
> +						      READ_VALUE(&data[prev_idx]));
> +			}
> +		}
> +
> +		if (exec_ufence)
> +			exec_ufence[0] = 0;
> +
> +		if (bo) {
> +			__xe_vm_bind_assert(fd, vm, 0,
> +					    0, 0, addr, bo_size,
> +					    DRM_XE_VM_BIND_OP_MAP,
> +					    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,
> +					    NULL, 0, 0, 0);
> +			munmap(data, bo_size);
> +			gem_close(fd, bo);
> +		}
> +
> +		if (flags & NEW) {
> +			if (flags & MMAP) {
> +				if (flags & FREE)
> +					munmap(data, bo_size);
> +				else
> +					pending_free[i] = data;
> +				data = mmap(NULL, bo_size, PROT_READ |
> +					    PROT_WRITE, MAP_SHARED |
> +					    MAP_ANONYMOUS, -1, 0);
> +				igt_assert(data != MAP_FAILED);
> +			} else if (flags & BO_MAP && odd(i)) {
> +				if (!bo) {
> +					if (flags & FREE)
> +						free(data);
> +					else
> +						pending_free[i] = data;
> +				}
> +
> +				aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
> +				data = aligned_alloc_type.ptr;
> +				igt_assert(data);
> +				__aligned_partial_free(&aligned_alloc_type);
> +
> +				bo_flags = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
> +				bo = xe_bo_create(fd, vm, bo_size,
> +						  vram_if_possible(fd, eci->gt_id),
> +						  bo_flags);
> +				data = xe_bo_map_fixed(fd, bo, bo_size,
> +						       to_user_pointer(data));
> +
> +				xe_vm_bind_async(fd, vm, 0, bo, 0,
> +						 to_user_pointer(data),
> +						 bo_size, 0, 0);
> +			} else {
> +				if (!bo) {
> +					if (flags & FREE)
> +						free(data);
> +					else
> +						pending_free[i] = data;
> +				}
> +				bo = 0;
> +				data = aligned_alloc(aligned_size, bo_size);
> +				igt_assert(data);
> +			}
> +			addr = to_user_pointer(data);
> +			if (!(flags & SKIP_MEMSET))
> +				memset(data, 0, bo_size);
> +		}
> +
> +		prev_idx = idx;
> +	}
> +
> +	if (bo) {
> +		__xe_vm_bind_assert(fd, vm, 0,
> +				    0, 0, addr, bo_size,
> +				    DRM_XE_VM_BIND_OP_MAP,
> +				    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,
> +				    NULL, 0, 0, 0);
> +		munmap(data, bo_size);
> +		data = NULL;
> +		gem_close(fd, bo);
> +	}
> +
> +	if (flags & BUSY)
> +		igt_assert_eq(unbind_system_allocator(), -EBUSY);
> +
> +	for (i = 0; i < n_exec_queues; i++)
> +		xe_exec_queue_destroy(fd, exec_queues[i]);
> +
> +	if (exec_ufence)
> +		munmap(exec_ufence, SZ_4K);
> +
> +	if (flags & LOCK)
> +		munlock(data, bo_size);
> +
> +	if (file_fd != -1)
> +		close(file_fd);
> +
> +	if (flags & NEW && !(flags & FREE)) {
> +		for (i = 0; i < n_execs; i++) {
> +			if (!pending_free[i])
> +				continue;
> +
> +			if (flags & MMAP)
> +				munmap(pending_free[i], bo_size);
> +			else
> +				free(pending_free[i]);
> +		}
> +		free(pending_free);
> +	}
> +	if (data) {
> +		if (flags & MMAP)
> +			munmap(data, bo_size);
> +		else if (!alloc)
> +			free(data);
> +	}
> +	if (free_vm)
> +		xe_vm_destroy(fd, vm);
> +}
> +
> +struct thread_data {
> +	pthread_t thread;
> +	pthread_mutex_t *mutex;
> +	pthread_cond_t *cond;
> +	pthread_barrier_t *barrier;
> +	int fd;
> +	struct drm_xe_engine_class_instance *eci;
> +	int n_exec_queues;
> +	int n_execs;
> +	size_t bo_size;
> +	size_t stride;
> +	uint32_t vm;
> +	unsigned int flags;
> +	void *alloc;
> +	bool *go;
> +};
> +
> +static void *thread(void *data)
> +{
> +	struct thread_data *t = data;
> +
> +	pthread_mutex_lock(t->mutex);
> +	while (!*t->go)
> +		pthread_cond_wait(t->cond, t->mutex);
> +	pthread_mutex_unlock(t->mutex);
> +
> +	test_exec(t->fd, t->eci, t->n_exec_queues, t->n_execs,
> +		  t->bo_size, t->stride, t->vm, t->alloc, t->barrier,
> +		  t->flags);
> +
> +	return NULL;
> +}
> +
> +static void
> +threads(int fd, int n_exec_queues, int n_execs, size_t bo_size,
> +	size_t stride, unsigned int flags, bool shared_vm)
> +{
> +	struct drm_xe_engine_class_instance *hwe;
> +	struct thread_data *threads_data;
> +	int n_engines = 0, i = 0;
> +	pthread_mutex_t mutex;
> +	pthread_cond_t cond;
> +	pthread_barrier_t barrier;
> +	uint32_t vm = 0;
> +	bool go = false;
> +	void *alloc = NULL;
> +
> +	if ((FILE_BACKED | FORK_READ) & flags)
> +		return;
> +
> +	xe_for_each_engine(fd, hwe)
> +		++n_engines;
> +
> +	if (shared_vm) {
> +		vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> +				  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> +		bind_system_allocator(NULL, 0);
> +	}
> +
> +	if (flags & SHARED_ALLOC) {
> +		uint64_t alloc_size;
> +
> +		igt_assert(stride);
> +
> +		alloc_size = sizeof(struct test_exec_data) * stride *
> +			n_execs * n_engines;
> +		alloc_size = xe_bb_size(fd, alloc_size);
> +		alloc = aligned_alloc(SZ_2M, alloc_size);
> +		igt_assert(alloc);
> +
> +		memset(alloc, 0, alloc_size);
> +		flags &= ~SHARED_ALLOC;
> +	}
> +
> +	threads_data = calloc(n_engines, sizeof(*threads_data));
> +	igt_assert(threads_data);
> +
> +	pthread_mutex_init(&mutex, 0);
> +	pthread_cond_init(&cond, 0);
> +	pthread_barrier_init(&barrier, 0, n_engines);
> +
> +	xe_for_each_engine(fd, hwe) {
> +		threads_data[i].mutex = &mutex;
> +		threads_data[i].cond = &cond;
> +		threads_data[i].barrier = (flags & SYNC_EXEC) ? &barrier : NULL;
> +		threads_data[i].fd = fd;
> +		threads_data[i].eci = hwe;
> +		threads_data[i].n_exec_queues = n_exec_queues;
> +		threads_data[i].n_execs = n_execs;
> +		threads_data[i].bo_size = bo_size;
> +		threads_data[i].stride = stride;
> +		threads_data[i].vm = vm;
> +		threads_data[i].flags = flags;
> +		threads_data[i].alloc = alloc ? alloc + i *
> +			sizeof(struct test_exec_data) : NULL;
> +		threads_data[i].go = &go;
> +		pthread_create(&threads_data[i].thread, 0, thread,
> +			       &threads_data[i]);
> +		++i;
> +	}
> +
> +	pthread_mutex_lock(&mutex);
> +	go = true;
> +	pthread_cond_broadcast(&cond);
> +	pthread_mutex_unlock(&mutex);
> +
> +	for (i = 0; i < n_engines; ++i)
> +		pthread_join(threads_data[i].thread, NULL);
> +
> +	if (shared_vm) {
> +		int ret;
> +
> +		if (flags & MMAP) {
> +			int tries = 300;
> +
> +			while (tries && (ret = unbind_system_allocator()) == -EBUSY) {
> +				sleep(.01);
> +				--tries;
> +			}
> +			igt_assert_eq(ret, 0);
> +		}
> +		xe_vm_destroy(fd, vm);
> +		if (alloc)
> +			free(alloc);
> +	}
> +	free(threads_data);
> +}
> +
> +static void process(struct drm_xe_engine_class_instance *hwe, int n_exec_queues,
> +		    int n_execs, size_t bo_size, size_t stride,
> +		    unsigned int flags)
> +{
> +	struct process_data *pdata;
> +	int map_fd;
> +	int fd;
> +
> +	map_fd = open(sync_file, O_RDWR, 0x666);
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +	wait_pdata(pdata);
> +
> +	fd = drm_open_driver(DRIVER_XE);
> +	test_exec(fd, hwe, n_exec_queues, n_execs,
> +		  bo_size, stride, 0, NULL, NULL, flags);
> +	drm_close_driver(fd);
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +static void
> +processes(int fd, int n_exec_queues, int n_execs, size_t bo_size,
> +	  size_t stride, unsigned int flags)
> +{
> +	struct drm_xe_engine_class_instance *hwe;
> +	struct process_data *pdata;
> +	int map_fd;
> +
> +	if (flags & FORK_READ)
> +		return;
> +
> +	map_fd = open(sync_file, O_RDWR | O_CREAT, 0x666);
> +	posix_fallocate(map_fd, 0, sizeof(*pdata));
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +
> +	init_pdata(pdata, 0);
> +
> +	xe_for_each_engine(fd, hwe) {
> +		igt_fork(child, 1)
> +			process(hwe, n_exec_queues, n_execs, bo_size,
> +				stride, flags);
> +	}
> +
> +	signal_pdata(pdata);
> +	igt_waitchildren();
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +struct section {
> +	const char *name;
> +	unsigned int flags;
> +};
> +
> +igt_main
> +{
> +	struct drm_xe_engine_class_instance *hwe;
> +	const struct section sections[] = {
> +		{ "malloc", 0 },
> +		{ "malloc-multi-fault", MULTI_FAULT },
> +		{ "malloc-fork-read", FORK_READ },
> +		{ "malloc-fork-read-after", FORK_READ | FORK_READ_AFTER },
> +		{ "malloc-mlock", LOCK },
> +		{ "malloc-race", RACE },
> +		{ "malloc-busy", BUSY },
> +		{ "malloc-bo-unmap", BO_UNMAP },
> +		{ "mmap", MMAP },
> +		{ "mmap-remap", MMAP | MREMAP },
> +		{ "mmap-remap-dontunmap", MMAP | MREMAP | DONTUNMAP },
> +		{ "mmap-remap-ro", MMAP | MREMAP | READ_ONLY_REMAP },
> +		{ "mmap-remap-ro-dontunmap", MMAP | MREMAP | DONTUNMAP |
> +			READ_ONLY_REMAP },
> +		{ "mmap-remap-eocheck", MMAP | MREMAP | EVERY_OTHER_CHECK },
> +		{ "mmap-remap-dontunmap-eocheck", MMAP | MREMAP | DONTUNMAP |
> +			EVERY_OTHER_CHECK },
> +		{ "mmap-remap-ro-eocheck", MMAP | MREMAP | READ_ONLY_REMAP |
> +			EVERY_OTHER_CHECK },
> +		{ "mmap-remap-ro-dontunmap-eocheck", MMAP | MREMAP | DONTUNMAP |
> +			READ_ONLY_REMAP | EVERY_OTHER_CHECK },
> +		{ "mmap-huge", MMAP | HUGE_PAGE },
> +		{ "mmap-shared", MMAP | LOCK | MMAP_SHARED },
> +		{ "mmap-shared-remap", MMAP | LOCK | MMAP_SHARED | MREMAP },
> +		{ "mmap-shared-remap-dontunmap", MMAP | LOCK | MMAP_SHARED |
> +			MREMAP | DONTUNMAP },
> +		{ "mmap-shared-remap-eocheck", MMAP | LOCK | MMAP_SHARED |
> +			MREMAP | EVERY_OTHER_CHECK },
> +		{ "mmap-shared-remap-dontunmap-eocheck", MMAP | LOCK |
> +			MMAP_SHARED | MREMAP | DONTUNMAP | EVERY_OTHER_CHECK },
> +		{ "mmap-mlock", MMAP | LOCK },
> +		{ "mmap-file", MMAP | FILE_BACKED },
> +		{ "mmap-file-mlock", MMAP | LOCK | FILE_BACKED },
> +		{ "mmap-race", MMAP | RACE },
> +		{ "free", NEW | FREE },
> +		{ "free-race", NEW | FREE | RACE },
> +		{ "new", NEW },
> +		{ "new-race", NEW | RACE },
> +		{ "new-bo-map", NEW | BO_MAP },
> +		{ "new-busy", NEW | BUSY },
> +		{ "mmap-free", MMAP | NEW | FREE },
> +		{ "mmap-free-huge", MMAP | NEW | FREE | HUGE_PAGE },
> +		{ "mmap-free-race", MMAP | NEW | FREE | RACE },
> +		{ "mmap-new", MMAP | NEW },
> +		{ "mmap-new-huge", MMAP | NEW | HUGE_PAGE },
> +		{ "mmap-new-race", MMAP | NEW | RACE },
> +		{ "malloc-nomemset", SKIP_MEMSET },
> +		{ "malloc-mlock-nomemset", SKIP_MEMSET | LOCK },
> +		{ "malloc-race-nomemset", SKIP_MEMSET | RACE },
> +		{ "malloc-busy-nomemset", SKIP_MEMSET | BUSY },
> +		{ "malloc-bo-unmap-nomemset", SKIP_MEMSET | BO_UNMAP },
> +		{ "mmap-nomemset", SKIP_MEMSET | MMAP },
> +		{ "mmap-huge-nomemset", SKIP_MEMSET | MMAP | HUGE_PAGE },
> +		{ "mmap-shared-nomemset", SKIP_MEMSET | MMAP | MMAP_SHARED },
> +		{ "mmap-mlock-nomemset", SKIP_MEMSET | MMAP | LOCK },
> +		{ "mmap-file-nomemset", SKIP_MEMSET | MMAP | FILE_BACKED },
> +		{ "mmap-file-mlock-nomemset", SKIP_MEMSET | MMAP | LOCK | FILE_BACKED },
> +		{ "mmap-race-nomemset", SKIP_MEMSET | MMAP | RACE },
> +		{ "free-nomemset", SKIP_MEMSET | NEW | FREE },
> +		{ "free-race-nomemset", SKIP_MEMSET | NEW | FREE | RACE },
> +		{ "new-nomemset", SKIP_MEMSET | NEW },
> +		{ "new-race-nomemset", SKIP_MEMSET | NEW | RACE },
> +		{ "new-bo-map-nomemset", SKIP_MEMSET | NEW | BO_MAP },
> +		{ "new-busy-nomemset", SKIP_MEMSET | NEW | BUSY },
> +		{ "mmap-free-nomemset", SKIP_MEMSET | MMAP | NEW | FREE },
> +		{ "mmap-free-huge-nomemset", SKIP_MEMSET | MMAP | NEW | FREE | HUGE_PAGE },
> +		{ "mmap-free-race-nomemset", SKIP_MEMSET | MMAP | NEW | FREE | RACE },
> +		{ "mmap-new-nomemset", SKIP_MEMSET | MMAP | NEW },
> +		{ "mmap-new-huge-nomemset", SKIP_MEMSET | MMAP | NEW | HUGE_PAGE },
> +		{ "mmap-new-race-nomemset", SKIP_MEMSET | MMAP | NEW | RACE },
> +		{ NULL },
> +	};
> +	const struct section psections[] = {
> +		{ "munmap-cpu-fault", CPU_FAULT },
> +		{ "munmap-no-cpu-fault", 0 },
> +		{ "remap-cpu-fault", CPU_FAULT | REMAP },
> +		{ "remap-no-cpu-fault", REMAP },
> +		{ "middle-munmap-cpu-fault", MIDDLE | CPU_FAULT },
> +		{ "middle-munmap-no-cpu-fault", MIDDLE },
> +		{ "middle-remap-cpu-fault", MIDDLE | CPU_FAULT | REMAP },
> +		{ "middle-remap-no-cpu-fault", MIDDLE | REMAP },
> +		{ NULL },
> +	};
> +	const struct section esections[] = {
> +		{ "malloc", 0 },
> +		{ "malloc-mix-bo", MIX_BO_ALLOC },
> +		{ NULL },
> +	};
> +	int fd;
> +
> +	igt_fixture {
> +		struct xe_device *xe;
> +
> +		fd = drm_open_driver(DRIVER_XE);
> +		igt_require(!xe_supports_faults(fd));
> +
> +		xe = xe_device_get(fd);
> +		va_bits = xe->va_bits;
> +		open_sync_file();
> +	}
> +
> +	for (const struct section *s = sections; s->name; s++) {
> +		igt_subtest_f("once-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 1, 0, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("once-large-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 1, SZ_2M, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("twice-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 2, 0, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("twice-large-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 2, SZ_2M, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("many-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 128, 0, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("many-stride-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 128, 0, 256, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("many-execqueues-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 16, 128, 0, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("many-large-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 128, SZ_2M, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("many-large-execqueues-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 16, 128, SZ_2M, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("threads-many-%s", s->name)
> +			threads(fd, 1, 128, 0, 0, s->flags, false);
> +
> +		igt_subtest_f("threads-many-stride-%s", s->name)
> +			threads(fd, 1, 128, 0, 256, s->flags, false);
> +
> +		igt_subtest_f("threads-many-execqueues-%s", s->name)
> +			threads(fd, 16, 128, 0, 0, s->flags, false);
> +
> +		igt_subtest_f("threads-many-large-%s", s->name)
> +			threads(fd, 1, 128, SZ_2M, 0, s->flags, false);
> +
> +		igt_subtest_f("threads-many-large-execqueues-%s", s->name)
> +			threads(fd, 16, 128, SZ_2M, 0, s->flags, false);
> +
> +		igt_subtest_f("threads-shared-vm-many-%s", s->name)
> +			threads(fd, 1, 128, 0, 0, s->flags, true);
> +
> +		igt_subtest_f("threads-shared-vm-many-stride-%s", s->name)
> +			threads(fd, 1, 128, 0, 256, s->flags, true);
> +
> +		igt_subtest_f("threads-shared-vm-many-execqueues-%s", s->name)
> +			threads(fd, 16, 128, 0, 0, s->flags, true);
> +
> +		igt_subtest_f("threads-shared-vm-many-large-%s", s->name)
> +			threads(fd, 1, 128, SZ_2M, 0, s->flags, true);
> +
> +		igt_subtest_f("threads-shared-vm-many-large-execqueues-%s", s->name)
> +			threads(fd, 16, 128, SZ_2M, 0, s->flags, true);
> +
> +		igt_subtest_f("process-many-%s", s->name)
> +			processes(fd, 1, 128, 0, 0, s->flags);
> +
> +		igt_subtest_f("process-many-stride-%s", s->name)
> +			processes(fd, 1, 128, 0, 256, s->flags);
> +
> +		igt_subtest_f("process-many-execqueues-%s", s->name)
> +			processes(fd, 16, 128, 0, 0, s->flags);
> +
> +		igt_subtest_f("process-many-large-%s", s->name)
> +			processes(fd, 1, 128, SZ_2M, 0, s->flags);
> +
> +		igt_subtest_f("process-many-large-execqueues-%s", s->name)
> +			processes(fd, 16, 128, SZ_2M, 0, s->flags);
> +	}
> +
> +	igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc")
> +		threads(fd, 1, 128, 0, 256, SHARED_ALLOC, true);
> +
> +	igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc-race")
> +		threads(fd, 1, 128, 0, 256, RACE | SHARED_ALLOC, true);
> +
> +	igt_subtest("threads-shared-alloc-many-stride-malloc")
> +		threads(fd, 1, 128, 0, 256, SHARED_ALLOC, false);
> +
> +	igt_subtest("threads-shared-alloc-many-stride-malloc-sync")
> +		threads(fd, 1, 128, 0, 256, SHARED_ALLOC | SYNC_EXEC, false);
> +
> +	igt_subtest("threads-shared-alloc-many-stride-malloc-race")
> +		threads(fd, 1, 128, 0, 256, RACE | SHARED_ALLOC, false);
> +
> +	igt_subtest_f("fault")
> +		xe_for_each_engine(fd, hwe)
> +			test_exec(fd, hwe, 4, 1, SZ_2M, 0, 0, NULL, NULL,
> +				  FAULT);
> +
> +	for (const struct section *s = psections; s->name; s++) {
> +		igt_subtest_f("partial-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				partial(fd, hwe, s->flags);
> +	}
> +
> +	igt_subtest_f("unaligned-alloc")
> +		xe_for_each_engine(fd, hwe) {
> +			many_allocs(fd, hwe, (SZ_1M + SZ_512K) * 8,
> +				    SZ_1M + SZ_512K, SZ_4K, NULL, 0);
> +			break;
> +		}
> +
> +	igt_subtest_f("fault-benchmark")
> +		xe_for_each_engine(fd, hwe)
> +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> +				    BENCHMARK);
> +
> +	igt_subtest_f("fault-threads-benchmark")
> +		xe_for_each_engine(fd, hwe)
> +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> +				    BENCHMARK | CPU_FAULT_THREADS);
> +
> +	igt_subtest_f("fault-threads-same-page-benchmark")
> +		xe_for_each_engine(fd, hwe)
> +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> +				    BENCHMARK | CPU_FAULT_THREADS |
> +				    CPU_FAULT_SAME_PAGE);
> +
> +	igt_subtest_f("fault-process-benchmark")
> +		xe_for_each_engine(fd, hwe)
> +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> +				    BENCHMARK | CPU_FAULT_PROCESS);
> +
> +	igt_subtest_f("fault-process-same-page-benchmark")
> +		xe_for_each_engine(fd, hwe)
> +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> +				    BENCHMARK | CPU_FAULT_PROCESS |
> +				    CPU_FAULT_SAME_PAGE);
> +
> +	for (const struct section *s = esections; s->name; s++) {
> +		igt_subtest_f("evict-%s", s->name)
> +			xe_for_each_engine(fd, hwe) {
> +				many_allocs(fd, hwe,
> +					    xe_visible_vram_size(fd, hwe->gt_id),
> +					    SZ_8M, SZ_1M, NULL, s->flags);
> +				break;
> +			}
> +	}
> +
> +	for (const struct section *s = esections; s->name; s++) {
> +		igt_subtest_f("processes-evict-%s", s->name)
> +			processes_evict(fd, SZ_8M, SZ_1M, s->flags);
> +	}
> +
> +	igt_fixture {
> +		xe_device_put(fd);
> +		drm_close_driver(fd);
> +		close_sync_file();
> +	}
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 6328792e3a..20ddddb89f 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -295,6 +295,7 @@ intel_xe_progs = [
>  	'xe_exec_reset',
>  	'xe_exec_sip',
>  	'xe_exec_store',
> +	'xe_exec_system_allocator',
>  	'xe_exec_threads',
>  	'xe_exercise_blt',
>  	'xe_fault_injection',
> -- 
> 2.34.1
> 
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 2/2] tests/xe: Add system_allocator test
  2025-04-24 22:32   ` Cavitt, Jonathan
@ 2025-04-24 22:39     ` Matthew Brost
  0 siblings, 0 replies; 8+ messages in thread
From: Matthew Brost @ 2025-04-24 22:39 UTC (permalink / raw)
  To: Cavitt, Jonathan; +Cc: igt-dev@lists.freedesktop.org, Dugast,  Francois

On Thu, Apr 24, 2025 at 04:32:06PM -0600, Cavitt, Jonathan wrote:
> -----Original Message-----
> From: igt-dev <igt-dev-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
> Sent: Thursday, April 24, 2025 1:44 PM
> To: igt-dev@lists.freedesktop.org
> Cc: Dugast, Francois <francois.dugast@intel.com>
> Subject: [PATCH v3 2/2] tests/xe: Add system_allocator test
> > 
> > Test various uses of system allocator in single thread, multiple
> > threads, and multiple processes.
> > 
> > Features tested:
> >  - Malloc with various sizes
> >  - Mmap with various sizes and flags including file backed mappings
> >  - Mixing BO allocations with system allocator
> >  - Various page sizes
> >  - Dynamically freeing / unmapping memory
> >  - Sharing VM across threads
> >  - Faults racing on different hardware engines / GTs / Tiles
> >  - GPU faults and CPU faults racing
> >  - CPU faults on multiple threads racing
> >  - CPU faults on multiple processes racing
> >  - GPU faults of memory not faulted in by CPU
> >  - Partial unmap of allocations
> >  - Attempting to unmap system allocations when GPU has mappings
> >  - Eviction of both system allocations and BOs
> >  - Forking child processes and reading data from VRAM
> >  - mremap data in VRAM
> >  - Protection changes
> >  - Multiple faults per execbuf
> > 
> > Running on LNL, BMG, PVC 1 tile, and PVC 2 tile.
> > 
> > v2:
> >  - Rebase
> >  - Fix memory allocation to not interfere with malloc (Thomas)
> > v3:
> >  - Fix memory leak (Francois)
> >  - Break out uAPI into own patch (Francois)
> >  - Use mkstemp for sync file (Francois)
> >  - Use mkstemp for file backed data (Francois)
> >  - Drop i argument from READ_VALUE (Francois)
> >  - Fix test description (Francois)
> >  - Add comment to check_all_pages_process (Francois)
> >  - Prefer igt_info over printf (Francois)
> >  - Fix types in messages (Francois)
> >  - Prefer odd macro (Francois)
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> 
> LGTM, though maybe it would be better to split the various tests
> introduced in xe_exec_system_allocator.c across multiple patches,
> instead of creating all of the tests at once?
> 

Francois and I discussed this, but there isn't really a great way to
split this, and IMO large patches in IGT matter a whole lot less than
large KMD patches.

> I'm okay with creating the test series either way, so this isn't a
> blocking request.
> 
> The remaining nits below are non-blocking as well, so this is:
> Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> 
> > ---
> >  lib/xe/xe_ioctl.c                      |   12 +
> >  lib/xe/xe_ioctl.h                      |    1 +
> >  tests/intel/xe_exec_system_allocator.c | 1849 ++++++++++++++++++++++++
> >  tests/meson.build                      |    1 +
> >  4 files changed, 1863 insertions(+)
> >  create mode 100644 tests/intel/xe_exec_system_allocator.c
> > 
> > diff --git a/lib/xe/xe_ioctl.c b/lib/xe/xe_ioctl.c
> > index fb8c4aef13..785fc9184c 100644
> > --- a/lib/xe/xe_ioctl.c
> > +++ b/lib/xe/xe_ioctl.c
> > @@ -440,6 +440,18 @@ void *xe_bo_map(int fd, uint32_t bo, size_t size)
> >  	return __xe_bo_map(fd, bo, size, PROT_WRITE);
> >  }
> >  
> > +void *xe_bo_map_fixed(int fd, uint32_t bo, size_t size, uint64_t addr)
> > +{
> > +	uint64_t mmo;
> > +	void *map;
> > +
> > +	mmo = xe_bo_mmap_offset(fd, bo);
> > +	map = mmap((void *)addr, size, PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, mmo);
> > +	igt_assert(map != MAP_FAILED);
> > +
> > +	return map;
> > +}
> > +
> >  void *xe_bo_mmap_ext(int fd, uint32_t bo, size_t size, int prot)
> >  {
> >  	return __xe_bo_map(fd, bo, size, prot);
> > diff --git a/lib/xe/xe_ioctl.h b/lib/xe/xe_ioctl.h
> > index 9bdf73b2bd..554a33c9cd 100644
> > --- a/lib/xe/xe_ioctl.h
> > +++ b/lib/xe/xe_ioctl.h
> > @@ -86,6 +86,7 @@ uint32_t xe_exec_queue_create_class(int fd, uint32_t vm, uint16_t class);
> >  void xe_exec_queue_destroy(int fd, uint32_t exec_queue);
> >  uint64_t xe_bo_mmap_offset(int fd, uint32_t bo);
> >  void *xe_bo_map(int fd, uint32_t bo, size_t size);
> > +void *xe_bo_map_fixed(int fd, uint32_t bo, size_t size, long unsigned int addr);
> >  void *xe_bo_mmap_ext(int fd, uint32_t bo, size_t size, int prot);
> >  int __xe_exec(int fd, struct drm_xe_exec *exec);
> >  void xe_exec(int fd, struct drm_xe_exec *exec);
> > diff --git a/tests/intel/xe_exec_system_allocator.c b/tests/intel/xe_exec_system_allocator.c
> > new file mode 100644
> > index 0000000000..4839090cb2
> > --- /dev/null
> > +++ b/tests/intel/xe_exec_system_allocator.c
> > @@ -0,0 +1,1849 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright (c) 2024 Intel Corporation
> 
> NIT:
> Shouldn't this be a 2025 copyright?
> 

Originally authored in 2024.

> > + */
> > +
> > +/**
> > + * TEST: Basic tests for execbuf functionality using system allocator
> > + * Category: Core
> > + * Mega feature: USM
> > + * Sub-category: System allocator
> > + * Functionality: fault mode, system allocator
> > + * GPU: LNL, BMG, PVC
> > + */
> > +
> > +#include <fcntl.h>
> > +#include <linux/mman.h>
> > +#include <time.h>
> > +
> > +#include "igt.h"
> > +#include "lib/igt_syncobj.h"
> > +#include "lib/intel_reg.h"
> > +#include "xe_drm.h"
> > +
> > +#include "xe/xe_ioctl.h"
> > +#include "xe/xe_query.h"
> > +#include <string.h>
> > +
> > +#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> > +#define QUARTER_SEC		(NSEC_PER_SEC / 4)
> > +#define FIVE_SEC		(5LL * NSEC_PER_SEC)
> > +
> > +struct batch_data {
> > +	uint32_t batch[16];
> > +	uint64_t pad;
> > +	uint32_t data;
> > +	uint32_t expected_data;
> > +};
> > +
> > +#define WRITE_VALUE(data__, i__)	({			\
> > +	if (!(data__)->expected_data)				\
> > +		(data__)->expected_data = rand() << 12 | (i__);	\
> > +	(data__)->expected_data;				\
> > +})
> > +#define READ_VALUE(data__)	((data__)->expected_data)
> > +
> > +static void __write_dword(uint32_t *batch, uint64_t sdi_addr, uint32_t wdata,
> > +			int *idx)
> > +{
> > +	batch[(*idx)++] = MI_STORE_DWORD_IMM_GEN4;
> > +	batch[(*idx)++] = sdi_addr;
> > +	batch[(*idx)++] = sdi_addr >> 32;
> > +	batch[(*idx)++] = wdata;
> > +}
> > +
> > +static void write_dword(uint32_t *batch, uint64_t sdi_addr, uint32_t wdata,
> > +			int *idx)
> > +{
> > +	__write_dword(batch, sdi_addr, wdata, idx);
> > +	batch[(*idx)++] = MI_BATCH_BUFFER_END;
> > +}
> > +
> > +static void check_all_pages(void *ptr, uint64_t alloc_size, uint64_t stride,
> > +			    pthread_barrier_t *barrier)
> > +{
> > +	int i, n_writes = alloc_size / stride;
> > +
> > +	for (i = 0; i < n_writes; ++i) {
> > +		struct batch_data *data = ptr + i * stride;
> > +
> > +		igt_assert_eq(data->data, READ_VALUE(data));
> > +
> > +		if (barrier)
> > +			pthread_barrier_wait(barrier);
> > +	}
> > +}
> > +
> > +static char sync_file[] = "/tmp/xe_exec_system_allocator_syncXXXXXX";
> > +static int sync_fd;
> > +
> > +static void open_sync_file(void)
> > +{
> > +	sync_fd = mkstemp(sync_file);
> > +}
> > +
> > +static void close_sync_file(void)
> > +{
> > +	close(sync_fd);
> > +}
> 
> NIT:
> The sync_fd doesn't seem like it's being used for anything.  However, just
> because I don't know what this file does doesn't mean it does nothing, so
> I won't block on this.
>

The name is used by child processes to open their own fd. The sync_fd
is just used by the master process to close the file upon exit.

> > +
> > +struct process_data {
> > +	pthread_mutex_t mutex;
> > +	pthread_cond_t cond;
> > +	pthread_barrier_t barrier;
> > +	bool go;
> > +};
> > +
> > +static void wait_pdata(struct process_data *pdata)
> > +{
> > +	pthread_mutex_lock(&pdata->mutex);
> > +	while (!pdata->go)
> > +		pthread_cond_wait(&pdata->cond, &pdata->mutex);
> > +	pthread_mutex_unlock(&pdata->mutex);
> > +}
> > +
> > +static void init_pdata(struct process_data *pdata, int n_engine)
> > +{
> > +	pthread_mutexattr_t mutex_attr;
> > +	pthread_condattr_t cond_attr;
> > +	pthread_barrierattr_t barrier_attr;
> > +
> > +	pthread_mutexattr_init(&mutex_attr);
> > +	pthread_mutexattr_setpshared(&mutex_attr, PTHREAD_PROCESS_SHARED);
> > +	pthread_mutex_init(&pdata->mutex, &mutex_attr);
> > +
> > +	pthread_condattr_init(&cond_attr);
> > +	pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED);
> > +	pthread_cond_init(&pdata->cond, &cond_attr);
> > +
> > +	pthread_barrierattr_init(&barrier_attr);
> > +	pthread_barrierattr_setpshared(&barrier_attr, PTHREAD_PROCESS_SHARED);
> > +	pthread_barrier_init(&pdata->barrier, &barrier_attr, n_engine);
> > +
> > +	pdata->go = false;
> > +}
> > +
> > +static void signal_pdata(struct process_data *pdata)
> > +{
> > +	pthread_mutex_lock(&pdata->mutex);
> > +	pdata->go = true;
> > +	pthread_cond_broadcast(&pdata->cond);
> > +	pthread_mutex_unlock(&pdata->mutex);
> > +}
> > +
> > +/* many_alloc flags */
> > +#define MIX_BO_ALLOC		(0x1 << 0)
> > +#define BENCHMARK		(0x1 << 1)
> > +#define CPU_FAULT_THREADS	(0x1 << 2)
> > +#define CPU_FAULT_PROCESS	(0x1 << 3)
> > +#define CPU_FAULT_SAME_PAGE	(0x1 << 4)
> > +
> > +static void process_check(void *ptr, uint64_t alloc_size, uint64_t stride,
> > +			  unsigned int flags)
> > +{
> > +	struct process_data *pdata;
> > +	int map_fd;
> > +
> > +	map_fd = open(sync_file, O_RDWR, 0x666);
> > +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> > +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> > +	wait_pdata(pdata);
> > +
> > +	if (flags & CPU_FAULT_SAME_PAGE)
> > +		check_all_pages(ptr, alloc_size, stride, &pdata->barrier);
> > +	else
> > +		check_all_pages(ptr, alloc_size, stride, NULL);
> > +
> > +	close(map_fd);
> > +	munmap(pdata, sizeof(*pdata));
> > +}
> > +
> > +/*
> > + * Partition checking of results in chunks which causes multiple processes to
> > + * fault same VRAM allocation in parallel.
> > + */
> > +static void
> > +check_all_pages_process(void *ptr, uint64_t alloc_size, uint64_t stride,
> > +			int n_process, unsigned int flags)
> > +{
> > +	struct process_data *pdata;
> > +	int map_fd, i;
> > +
> > +	map_fd = open(sync_file, O_RDWR | O_CREAT, 0x666);
> > +	posix_fallocate(map_fd, 0, sizeof(*pdata));
> > +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> > +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> > +
> > +	init_pdata(pdata, n_process);
> > +
> > +	for (i = 0; i < n_process; ++i) {
> > +		igt_fork(child, 1)
> > +			if (flags & CPU_FAULT_SAME_PAGE)
> > +				process_check(ptr, alloc_size, stride, flags);
> > +			else
> > +				process_check(ptr + stride * i, alloc_size,
> > +					      stride * n_process, flags);
> > +	}
> > +
> > +	signal_pdata(pdata);
> > +	igt_waitchildren();
> > +
> > +	close(map_fd);
> > +	munmap(pdata, sizeof(*pdata));
> > +}
> > +
> > +struct thread_check_data {
> > +	pthread_t thread;
> > +	pthread_mutex_t *mutex;
> > +	pthread_cond_t *cond;
> > +	pthread_barrier_t *barrier;
> > +	void *ptr;
> > +	uint64_t alloc_size;
> > +	uint64_t stride;
> > +	bool *go;
> > +};
> > +
> > +static void *thread_check(void *data)
> > +{
> > +	struct thread_check_data *t = data;
> > +
> > +	pthread_mutex_lock(t->mutex);
> > +	while (!*t->go)
> > +		pthread_cond_wait(t->cond, t->mutex);
> > +	pthread_mutex_unlock(t->mutex);
> > +
> > +	check_all_pages(t->ptr, t->alloc_size, t->stride, t->barrier);
> > +
> > +	return NULL;
> > +}
> > +
> > +/*
> > + * Partition checking of results in chunks which causes multiple threads to
> > + * fault same VRAM allocation in parallel.
> > + */
> > +static void
> > +check_all_pages_threads(void *ptr, uint64_t alloc_size, uint64_t stride,
> > +			int n_threads, unsigned int flags)
> > +{
> > +	struct thread_check_data *threads_check_data;
> > +	pthread_mutex_t mutex;
> > +	pthread_cond_t cond;
> > +	pthread_barrier_t barrier;
> > +	int i;
> > +	bool go = false;
> > +
> > +	threads_check_data = calloc(n_threads, sizeof(*threads_check_data));
> > +	igt_assert(threads_check_data);
> > +
> > +	pthread_mutex_init(&mutex, 0);
> > +	pthread_cond_init(&cond, 0);
> > +	pthread_barrier_init(&barrier, 0, n_threads);
> > +
> > +	for (i = 0; i < n_threads; ++i) {
> > +		threads_check_data[i].mutex = &mutex;
> > +		threads_check_data[i].cond = &cond;
> > +		if (flags & CPU_FAULT_SAME_PAGE) {
> > +			threads_check_data[i].barrier = &barrier;
> > +			threads_check_data[i].ptr = ptr;
> > +			threads_check_data[i].alloc_size = alloc_size;
> > +			threads_check_data[i].stride = stride;
> > +		} else {
> > +			threads_check_data[i].barrier = NULL;
> > +			threads_check_data[i].ptr = ptr + stride * i;
> > +			threads_check_data[i].alloc_size = alloc_size;
> > +			threads_check_data[i].stride = n_threads * stride;
> > +		}
> > +		threads_check_data[i].go = &go;
> > +
> > +		pthread_create(&threads_check_data[i].thread, 0, thread_check,
> > +			       &threads_check_data[i]);
> > +	}
> > +
> > +	pthread_mutex_lock(&mutex);
> > +	go = true;
> > +	pthread_cond_broadcast(&cond);
> > +	pthread_mutex_unlock(&mutex);
> > +
> > +	for (i = 0; i < n_threads; ++i)
> > +		pthread_join(threads_check_data[i].thread, NULL);
> > +	free(threads_check_data);
> > +}
> > +
> > +static void touch_all_pages(int fd, uint32_t exec_queue, void *ptr,
> > +			    uint64_t alloc_size, uint64_t stride,
> > +			    struct timespec *tv, uint64_t *submit)
> > +{
> > +	struct drm_xe_sync sync[1] = {
> > +		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
> > +		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> > +		  .timeline_value = USER_FENCE_VALUE },
> > +	};
> > +	struct drm_xe_exec exec = {
> > +		.num_batch_buffer = 1,
> > +		.num_syncs = 0,
> > +		.exec_queue_id = exec_queue,
> > +		.syncs = to_user_pointer(&sync),
> > +	};
> > +	uint64_t addr = to_user_pointer(ptr);
> > +	int i, ret, n_writes = alloc_size / stride;
> > +	u64 *exec_ufence = NULL;
> > +	int64_t timeout = FIVE_SEC;
> > +
> > +	exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
> > +			   PROT_WRITE, MAP_SHARED |
> > +			   MAP_ANONYMOUS, -1, 0);
> > +	igt_assert(exec_ufence != MAP_FAILED);
> > +	memset(exec_ufence, 0, SZ_4K);
> > +	sync[0].addr = to_user_pointer(exec_ufence);
> > +
> > +	for (i = 0; i < n_writes; ++i, addr += stride) {
> > +		struct batch_data *data = ptr + i * stride;
> > +		uint64_t sdi_offset = (char *)&data->data - (char *)data;
> > +		uint64_t sdi_addr = addr + sdi_offset;
> > +		int b = 0;
> > +
> > +		write_dword(data->batch, sdi_addr, WRITE_VALUE(data, i), &b);
> > +		igt_assert(b <= ARRAY_SIZE(data->batch));
> > +	}
> > +
> > +	igt_nsec_elapsed(tv);
> > +	*submit = igt_nsec_elapsed(tv);
> > +
> > +	addr = to_user_pointer(ptr);
> > +	for (i = 0; i < n_writes; ++i, addr += stride) {
> > +		struct batch_data *data = ptr + i * stride;
> > +		uint64_t batch_offset = (char *)&data->batch - (char *)data;
> > +		uint64_t batch_addr = addr + batch_offset;
> > +
> > +		exec.address = batch_addr;
> > +		if (i + 1 == n_writes)
> > +			exec.num_syncs = 1;
> > +		xe_exec(fd, &exec);
> > +	}
> > +
> > +	ret = __xe_wait_ufence(fd, exec_ufence, USER_FENCE_VALUE, exec_queue,
> > +			       &timeout);
> > +	if (ret) {
> > +		igt_info("FAIL EXEC_UFENCE_ADDR: 0x%016llx\n", sync[0].addr);
> > +		igt_info("FAIL EXEC_UFENCE: EXPECTED=0x%016llx, ACTUAL=0x%016lx\n",
> > +			 USER_FENCE_VALUE, exec_ufence[0]);
> > +
> > +		addr = to_user_pointer(ptr);
> > +		for (i = 0; i < n_writes; ++i, addr += stride) {
> > +			struct batch_data *data = ptr + i * stride;
> > +			uint64_t batch_offset = (char *)&data->batch - (char *)data;
> > +			uint64_t batch_addr = addr + batch_offset;
> > +			uint64_t sdi_offset = (char *)&data->data - (char *)data;
> > +			uint64_t sdi_addr = addr + sdi_offset;
> > +
> > +			igt_info("FAIL BATCH_ADDR: 0x%016lx\n", batch_addr);
> > +			igt_info("FAIL SDI_ADDR: 0x%016lx\n", sdi_addr);
> > +			igt_info("FAIL SDI_ADDR (in batch): 0x%016lx\n",
> > +				 (((u64)data->batch[2]) << 32) | data->batch[1]);
> > +			igt_info("FAIL DATA: EXPECTED=0x%08x, ACTUAL=0x%08x\n",
> > +				 data->expected_data, data->data);
> > +		}
> > +		igt_assert_eq(ret, 0);
> > +	}
> > +	munmap(exec_ufence, SZ_4K);
> > +}
> > +
> > +static int va_bits;
> > +
> > +#define bind_system_allocator(__sync, __num_sync)			\
> > +	__xe_vm_bind_assert(fd, vm, 0,					\
> > +			    0, 0, 0, 0x1ull << va_bits,			\
> > +			    DRM_XE_VM_BIND_OP_MAP,			\
> > +			    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,	\
> > +			    (__sync), (__num_sync), 0, 0)
> > +
> > +#define unbind_system_allocator()				\
> > +	__xe_vm_bind(fd, vm, 0, 0, 0, 0, 0x1ull << va_bits,	\
> > +		     DRM_XE_VM_BIND_OP_UNMAP, 0,		\
> > +		     NULL, 0, 0, 0, 0)
> > +
> > +#define odd(__i)	(__i & 1)
> > +
> > +struct aligned_alloc_type {
> > +	void *__ptr;
> > +	void *ptr;
> > +	size_t __size;
> > +	size_t size;
> > +};
> > +
> > +static struct aligned_alloc_type __aligned_alloc(size_t alignment, size_t size)
> > +{
> > +	struct aligned_alloc_type aligned_alloc_type;
> > +
> > +	aligned_alloc_type.__ptr = mmap(NULL, alignment + size, PROT_NONE, MAP_PRIVATE |
> > +			      MAP_ANONYMOUS, -1, 0);
> > +	igt_assert(aligned_alloc_type.__ptr != MAP_FAILED);
> > +
> > +	aligned_alloc_type.ptr = (void *)ALIGN((uint64_t)aligned_alloc_type.__ptr, alignment);
> > +	aligned_alloc_type.size = size;
> > +	aligned_alloc_type.__size = size + alignment;
> > +
> > +	return aligned_alloc_type;
> > +}
> > +
> > +static void __aligned_free(struct aligned_alloc_type  *aligned_alloc_type)
> > +{
> > +	munmap(aligned_alloc_type->__ptr, aligned_alloc_type->__size);
> > +}
> > +
> > +static void __aligned_partial_free(struct aligned_alloc_type  *aligned_alloc_type)
> > +{
> > +	size_t begin_size = (size_t)(aligned_alloc_type->ptr - aligned_alloc_type->__ptr);
> > +
> > +	if (begin_size)
> > +		munmap(aligned_alloc_type->__ptr, begin_size);
> > +	if (aligned_alloc_type->__size - aligned_alloc_type->size - begin_size)
> > +		munmap(aligned_alloc_type->ptr + aligned_alloc_type->size,
> > +		       aligned_alloc_type->__size - aligned_alloc_type->size - begin_size);
> > +}
> > +
> > +/**
> > + * SUBTEST: unaligned-alloc
> > + * Description: allocate unaligned sizes of memory
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: fault-benchmark
> > + * Description: Benchmark how long GPU / CPU take
> > + * Test category: performance test
> > + *
> > + * SUBTEST: fault-threads-benchmark
> > + * Description: Benchmark how long GPU / CPU take, reading results with multiple threads
> > + * Test category: performance and functionality test
> > + *
> > + * SUBTEST: fault-threads-same-page-benchmark
> > + * Description: Benchmark how long GPU / CPU take, reading results with multiple threads, hammer same page
> > + * Test category: performance and functionality test
> > + *
> > + * SUBTEST: fault-process-benchmark
> > + * Description: Benchmark how long GPU / CPU take, reading results with multiple processes
> > + * Test category: performance and functionality test
> > + *
> > + * SUBTEST: fault-process-same-page-benchmark
> > + * Description: Benchmark how long GPU / CPU take, reading results with multiple processes, hammer same page
> > + * Test category: performance and functionality test
> > + *
> > + * SUBTEST: evict-malloc
> > + * Description: trigger eviction of VRAM allocated via malloc
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: evict-malloc-mix-bo
> > + * Description: trigger eviction of VRAM allocated via malloc and BO create
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: processes-evict-malloc
> > + * Description: multi-process trigger eviction of VRAM allocated via malloc
> > + * Test category: stress test
> > + *
> > + * SUBTEST: processes-evict-malloc-mix-bo
> > + * Description: multi-process trigger eviction of VRAM allocated via malloc and BO create
> > + * Test category: stress test
> > + */
> > +
> > +static void
> > +many_allocs(int fd, struct drm_xe_engine_class_instance *eci,
> > +	    uint64_t total_alloc, uint64_t alloc_size, uint64_t stride,
> > +	    pthread_barrier_t *barrier, unsigned int flags)
> > +{
> > +	uint32_t vm, exec_queue;
> > +	int num_allocs = flags & BENCHMARK ? 1 :
> > +		(9 * (total_alloc / alloc_size)) / 8;
> > +	struct aligned_alloc_type *allocs;
> > +	uint32_t *bos = NULL;
> > +	struct timespec tv = {};
> > +	uint64_t submit, read, elapsed;
> > +	int i;
> > +
> > +	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> > +			  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> > +	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
> > +
> > +	bind_system_allocator(NULL, 0);
> > +
> > +	allocs = malloc(sizeof(*allocs) * num_allocs);
> > +	igt_assert(allocs);
> > +	memset(allocs, 0, sizeof(*allocs) * num_allocs);
> > +
> > +	if (flags & MIX_BO_ALLOC) {
> > +		bos = malloc(sizeof(*bos) * num_allocs);
> > +		igt_assert(bos);
> > +		memset(bos, 0, sizeof(*bos) * num_allocs);
> > +	}
> > +
> > +	for (i = 0; i < num_allocs; ++i) {
> > +		struct aligned_alloc_type alloc;
> > +
> > +		if (flags & MIX_BO_ALLOC && odd(i)) {
> > +			uint32_t bo_flags =
> > +				DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
> > +
> > +			alloc = __aligned_alloc(SZ_2M, alloc_size);
> > +			igt_assert(alloc.ptr);
> > +
> > +			bos[i] = xe_bo_create(fd, vm, alloc_size,
> > +					      vram_if_possible(fd, eci->gt_id),
> > +					      bo_flags);
> > +			alloc.ptr = xe_bo_map_fixed(fd, bos[i], alloc_size,
> > +						    to_user_pointer(alloc.ptr));
> > +			xe_vm_bind_async(fd, vm, 0, bos[i], 0,
> > +					 to_user_pointer(alloc.ptr),
> > +					 alloc_size, 0, 0);
> > +		} else {
> > +			alloc.ptr = aligned_alloc(SZ_2M, alloc_size);
> > +			igt_assert(alloc.ptr);
> > +		}
> > +		allocs[i] = alloc;
> > +
> > +		touch_all_pages(fd, exec_queue, allocs[i].ptr, alloc_size, stride,
> > +				&tv, &submit);
> > +	}
> > +
> > +	if (barrier)
> > +		pthread_barrier_wait(barrier);
> > +
> > +	for (i = 0; i < num_allocs; ++i) {
> > +		if (flags & BENCHMARK)
> > +			read = igt_nsec_elapsed(&tv);
> > +#define NUM_CHECK_THREADS	8
> > +		if (flags & CPU_FAULT_PROCESS)
> > +			check_all_pages_process(allocs[i].ptr, alloc_size, stride,
> > +						NUM_CHECK_THREADS, flags);
> > +		else if (flags & CPU_FAULT_THREADS)
> > +			check_all_pages_threads(allocs[i].ptr, alloc_size, stride,
> > +						NUM_CHECK_THREADS, flags);
> > +		else
> > +			check_all_pages(allocs[i].ptr, alloc_size, stride, NULL);
> > +		if (flags & BENCHMARK) {
> > +			elapsed = igt_nsec_elapsed(&tv);
> > +			igt_info("Execution took %.3fms (submit %.1fus, read %.1fus, total %.1fus, read_total %.1fus)\n",
> > +				 1e-6 * elapsed, 1e-3 * submit, 1e-3 * read,
> > +				 1e-3 * (elapsed - submit),
> > +				 1e-3 * (elapsed - read));
> > +		}
> > +		if (bos && bos[i]) {
> > +			__aligned_free(allocs + i);
> > +			gem_close(fd, bos[i]);
> > +		} else {
> > +			free(allocs[i].ptr);
> > +		}
> > +	}
> > +	if (bos)
> > +		free(bos);
> > +	free(allocs);
> > +	xe_exec_queue_destroy(fd, exec_queue);
> > +	xe_vm_destroy(fd, vm);
> > +}
> > +
> > +static void process_evict(struct drm_xe_engine_class_instance *hwe,
> > +			  uint64_t total_alloc, uint64_t alloc_size,
> > +			  uint64_t stride, unsigned int flags)
> > +{
> > +	struct process_data *pdata;
> > +	int map_fd;
> > +	int fd;
> > +
> > +	map_fd = open(sync_file, O_RDWR, 0x666);
> > +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> > +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> > +	wait_pdata(pdata);
> > +
> > +	fd = drm_open_driver(DRIVER_XE);
> > +	many_allocs(fd, hwe, total_alloc, alloc_size, stride, &pdata->barrier,
> > +		    flags);
> > +	drm_close_driver(fd);
> > +
> > +	close(map_fd);
> > +	munmap(pdata, sizeof(*pdata));
> > +}
> > +
> > +static void
> > +processes_evict(int fd, uint64_t alloc_size, uint64_t stride,
> > +		unsigned int flags)
> > +{
> > +	struct drm_xe_engine_class_instance *hwe;
> > +	struct process_data *pdata;
> > +	int n_engine_gt[2] = { 0, 0 }, n_engine = 0;
> > +	int map_fd;
> > +
> > +	map_fd = open(sync_file, O_RDWR | O_CREAT, 0x666);
> > +	posix_fallocate(map_fd, 0, sizeof(*pdata));
> > +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> > +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> > +
> > +	xe_for_each_engine(fd, hwe) {
> > +		igt_assert(hwe->gt_id < 2);
> > +		n_engine_gt[hwe->gt_id]++;
> > +		n_engine++;
> > +	}
> > +
> > +	init_pdata(pdata, n_engine);
> > +
> > +	xe_for_each_engine(fd, hwe) {
> > +		igt_fork(child, 1)
> > +			process_evict(hwe,
> > +				      xe_visible_vram_size(fd, hwe->gt_id) /
> > +				      n_engine_gt[hwe->gt_id], alloc_size,
> > +				      stride, flags);
> > +	}
> > +
> > +	signal_pdata(pdata);
> > +	igt_waitchildren();
> > +
> > +	close(map_fd);
> > +	munmap(pdata, sizeof(*pdata));
> > +}
> > +
> > +#define CPU_FAULT	(0x1 << 0)
> > +#define REMAP		(0x1 << 1)
> > +#define MIDDLE		(0x1 << 2)
> > +
> > +/**
> > + * SUBTEST: partial-munmap-cpu-fault
> > + * Description: munmap partially with cpu access in between
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: partial-munmap-no-cpu-fault
> > + * Description: munmap partially with no cpu access in between
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: partial-remap-cpu-fault
> > + * Description: remap partially with cpu access in between
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: partial-remap-no-cpu-fault
> > + * Description: remap partially with no cpu access in between
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: partial-middle-munmap-cpu-fault
> > + * Description: munmap middle with cpu access in between
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: partial-middle-munmap-no-cpu-fault
> > + * Description: munmap middle with no cpu access in between
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: partial-middle-remap-cpu-fault
> > + * Description: remap middle with cpu access in between
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: partial-middle-remap-no-cpu-fault
> > + * Description: remap middle with no cpu access in between
> > + * Test category: functionality test
> > + */
> > +
> > +static void
> > +partial(int fd, struct drm_xe_engine_class_instance *eci, unsigned int flags)
> > +{
> > +	struct drm_xe_sync sync[1] = {
> > +		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> > +	          .timeline_value = USER_FENCE_VALUE },
> > +	};
> > +	struct drm_xe_exec exec = {
> > +		.num_batch_buffer = 1,
> > +		.num_syncs = 1,
> > +		.syncs = to_user_pointer(sync),
> > +	};
> > +	struct {
> > +		uint32_t batch[16];
> > +		uint64_t pad;
> > +		uint64_t vm_sync;
> > +		uint64_t exec_sync;
> > +		uint32_t data;
> > +		uint32_t expected_data;
> > +	} *data;
> > +	size_t bo_size = SZ_2M, unmap_offset = 0;
> > +	uint32_t vm, exec_queue;
> > +	u64 *exec_ufence = NULL;
> > +	int i;
> > +	void *old, *new = NULL;
> > +	struct aligned_alloc_type alloc;
> > +
> > +	if (flags & MIDDLE)
> > +		unmap_offset = bo_size / 4;
> > +
> > +	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> > +			  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> > +
> > +	alloc = __aligned_alloc(bo_size, bo_size);
> > +	igt_assert(alloc.ptr);
> > +
> > +	data = mmap(alloc.ptr, bo_size, PROT_READ | PROT_WRITE,
> > +		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
> > +	igt_assert(data != MAP_FAILED);
> > +	memset(data, 0, bo_size);
> > +	old = data;
> > +
> > +	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
> > +
> > +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > +	bind_system_allocator(sync, 1);
> > +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0, FIVE_SEC);
> > +	data[0].vm_sync = 0;
> > +
> > +	exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
> > +			   PROT_WRITE, MAP_SHARED |
> > +			   MAP_ANONYMOUS, -1, 0);
> > +	igt_assert(exec_ufence != MAP_FAILED);
> > +	memset(exec_ufence, 0, SZ_4K);
> > +
> > +	for (i = 0; i < 2; i++) {
> > +		uint64_t addr = to_user_pointer(data);
> > +		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> > +		uint64_t sdi_addr = addr + sdi_offset;
> > +		int b = 0;
> > +
> > +		write_dword(data[i].batch, sdi_addr, WRITE_VALUE(&data[i], i), &b);
> > +		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> > +
> > +		if (!i)
> > +			data = old + unmap_offset + bo_size / 2;
> > +	}
> > +
> > +	data = old;
> > +	exec.exec_queue_id = exec_queue;
> > +
> > +	for (i = 0; i < 2; i++) {
> > +		uint64_t addr = to_user_pointer(data);
> > +		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> > +		uint64_t batch_addr = addr + batch_offset;
> > +
> > +		sync[0].addr = new ? to_user_pointer(new) :
> > +			to_user_pointer(exec_ufence);
> > +		exec.address = batch_addr;
> > +		xe_exec(fd, &exec);
> > +
> > +		xe_wait_ufence(fd, new ?: exec_ufence, USER_FENCE_VALUE,
> > +			       exec_queue, FIVE_SEC);
> > +		if (i || (flags & CPU_FAULT))
> > +			igt_assert_eq(data[i].data, READ_VALUE(&data[i]));
> > +		exec_ufence[0] = 0;
> > +
> > +		if (!i) {
> > +			data = old + unmap_offset + bo_size / 2;
> > +			munmap(old + unmap_offset, bo_size / 2);
> > +			if (flags & REMAP) {
> > +				new = mmap(old + unmap_offset, bo_size / 2,
> > +					   PROT_READ | PROT_WRITE,
> > +					   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
> > +					   MAP_LOCKED, -1, 0);
> > +				igt_assert(new != MAP_FAILED);
> > +			}
> > +		}
> > +	}
> > +
> > +	xe_exec_queue_destroy(fd, exec_queue);
> > +	munmap(exec_ufence, SZ_4K);
> > +	__aligned_free(&alloc);
> > +	if (new)
> > +		munmap(new, bo_size / 2);
> > +	xe_vm_destroy(fd, vm);
> > +}
> > +
> > +#define MAX_N_EXEC_QUEUES	16
> > +
> > +#define MMAP		(0x1 << 0)
> > +#define NEW		(0x1 << 1)
> > +#define BO_UNMAP	(0x1 << 2)
> > +#define FREE		(0x1 << 3)
> > +#define BUSY		(0x1 << 4)
> > +#define BO_MAP		(0x1 << 5)
> > +#define RACE		(0x1 << 6)
> > +#define SKIP_MEMSET	(0x1 << 7)
> > +#define FAULT		(0x1 << 8)
> > +#define FILE_BACKED	(0x1 << 9)
> > +#define LOCK		(0x1 << 10)
> > +#define MMAP_SHARED	(0x1 << 11)
> > +#define HUGE_PAGE	(0x1 << 12)
> > +#define SHARED_ALLOC	(0x1 << 13)
> > +#define FORK_READ	(0x1 << 14)
> > +#define FORK_READ_AFTER	(0x1 << 15)
> > +#define MREMAP		(0x1 << 16)
> > +#define DONTUNMAP	(0x1 << 17)
> > +#define READ_ONLY_REMAP	(0x1 << 18)
> > +#define SYNC_EXEC	(0x1 << 19)
> > +#define EVERY_OTHER_CHECK	(0x1 << 20)
> > +#define MULTI_FAULT	(0x1 << 21)
> 
> NIT:
> The above doesn't look aligned, but I'm willing to bet that has
> to do with my email client and that it's aligned in code.
> 
> However, there is a notable standout with EVERY_OTHER_CHECK.
> IMO, I think all of the values should be aligned to that if they aren't
> already.
>

Yeah, EVERY_OTHER_CHECK is unaligned; I can align all the flags to that
alignment.

Matt
 
> -Jonathan Cavitt
> 
> > +
> > +#define N_MULTI_FAULT	4
> > +
> > +/**
> > + * SUBTEST: once-%s
> > + * Description: Run %arg[1] system allocator test only once
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: once-large-%s
> > + * Description: Run %arg[1] system allocator test only once with large allocation
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: twice-%s
> > + * Description: Run %arg[1] system allocator test twice
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: twice-large-%s
> > + * Description: Run %arg[1] system allocator test twice with large allocation
> > + * Test category: functionality test
> > + *
> > + * SUBTEST: many-%s
> > + * Description: Run %arg[1] system allocator test many times
> > + * Test category: stress test
> > + *
> > + * SUBTEST: many-stride-%s
> > + * Description: Run %arg[1] system allocator test many times with a stride on each exec
> > + * Test category: stress test
> > + *
> > + * SUBTEST: many-execqueues-%s
> > + * Description: Run %arg[1] system allocator test on many exec_queues
> > + * Test category: stress test
> > + *
> > + * SUBTEST: many-large-%s
> > + * Description: Run %arg[1] system allocator test many times with large allocations
> > + * Test category: stress test
> > + *
> > + * SUBTEST: many-large-execqueues-%s
> > + * Description: Run %arg[1] system allocator test on many exec_queues with large allocations
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-many-%s
> > + * Description: Run %arg[1] system allocator threaded test many times
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-many-stride-%s
> > + * Description: Run %arg[1] system allocator threaded test many times with a stride on each exec
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-many-execqueues-%s
> > + * Description: Run %arg[1] system allocator threaded test on many exec_queues
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-many-large-%s
> > + * Description: Run %arg[1] system allocator threaded test many times with large allocations
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-many-large-execqueues-%s
> > + * Description: Run %arg[1] system allocator threaded test on many exec_queues with large allocations
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-shared-vm-many-%s
> > + * Description: Run %arg[1] system allocator threaded, shared vm test many times
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-shared-vm-many-stride-%s
> > + * Description: Run %arg[1] system allocator threaded, shared vm test many times with a stride on each exec
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-shared-vm-many-execqueues-%s
> > + * Description: Run %arg[1] system allocator threaded, shared vm test on many exec_queues
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-shared-vm-many-large-%s
> > + * Description: Run %arg[1] system allocator threaded, shared vm test many times with large allocations
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-shared-vm-many-large-execqueues-%s
> > + * Description: Run %arg[1] system allocator threaded, shared vm test on many exec_queues with large allocations
> > + * Test category: stress test
> > + *
> > + * SUBTEST: process-many-%s
> > + * Description: Run %arg[1] system allocator multi-process test many times
> > + * Test category: stress test
> > + *
> > + * SUBTEST: process-many-stride-%s
> > + * Description: Run %arg[1] system allocator multi-process test many times with a stride on each exec
> > + * Test category: stress test
> > + *
> > + * SUBTEST: process-many-execqueues-%s
> > + * Description: Run %arg[1] system allocator multi-process test on many exec_queues
> > + * Test category: stress test
> > + *
> > + * SUBTEST: process-many-large-%s
> > + * Description: Run %arg[1] system allocator multi-process test many times with large allocations
> > + * Test category: stress test
> > + *
> > + * SUBTEST: process-many-large-execqueues-%s
> > + * Description: Run %arg[1] system allocator multi-process test on many exec_queues with large allocations
> > + * Test category: stress test
> > + *
> > + * SUBTEST: fault
> > + * Description: use a bad system allocator address resulting in a fault
> > + * Test category: bad input
> > + *
> > + * arg[1]:
> > + *
> > + * @malloc:				malloc single buffer for all execs
> > + * @malloc-multi-fault:			malloc single buffer for all execs, issue a command which will trigger multiple faults
> > + * @malloc-fork-read:			malloc single buffer for all execs, fork a process to read test output
> > + * @malloc-fork-read-after:		malloc single buffer for all execs, fork a process to read test output, check again after fork returns in parent
> > + * @malloc-mlock:			malloc and mlock single buffer for all execs
> > + * @malloc-race:			malloc single buffer for all execs with race between cpu and gpu access
> > + * @malloc-bo-unmap:			malloc single buffer for all execs, bind and unbind a BO to same address before execs
> > + * @malloc-busy:			malloc single buffer for all execs, try to unbind while buffer valid
> > + * @mmap:				mmap single buffer for all execs
> > + * @mmap-remap:				mmap and mremap a buffer for all execs
> > + * @mmap-remap-dontunmap:		mmap and mremap a buffer with dontunmap flag for all execs
> > + * @mmap-remap-ro:			mmap and mremap a read-only buffer for all execs
> > + * @mmap-remap-ro-dontunmap:		mmap and mremap a read-only buffer with dontunmap flag for all execs
> > + * @mmap-remap-eocheck:			mmap and mremap a buffer for all execs, check data every other loop iteration
> > + * @mmap-remap-dontunmap-eocheck:	mmap and mremap a buffer with dontunmap flag for all execs, check data every other loop iteration
> > + * @mmap-remap-ro-eocheck:		mmap and mremap a read-only buffer for all execs, check data every other loop iteration
> > + * @mmap-remap-ro-dontunmap-eocheck:	mmap and mremap a read-only buffer with dontunmap flag for all execs, check data every other loop iteration
> > + * @mmap-huge:				mmap huge page single buffer for all execs
> > + * @mmap-shared:			mmap shared single buffer for all execs
> > + * @mmap-shared-remap:			mmap shared and mremap a buffer for all execs
> > + * @mmap-shared-remap-dontunmap:	mmap shared and mremap a buffer with dontunmap flag for all execs
> > + * @mmap-shared-remap-eocheck:		mmap shared and mremap a buffer for all execs, check data every other loop iteration
> > + * @mmap-shared-remap-dontunmap-eocheck:	mmap shared and mremap a buffer with dontunmap flag for all execs, check data every other loop iteration
> > + * @mmap-mlock:				mmap and mlock single buffer for all execs
> > + * @mmap-file:				mmap single buffer, with file backing, for all execs
> > + * @mmap-file-mlock:			mmap and mlock single buffer, with file backing, for all execs
> > + * @mmap-race:				mmap single buffer for all execs with race between cpu and gpu access
> > + * @free:				malloc and free buffer for each exec
> > + * @free-race:				malloc and free buffer for each exec with race between cpu and gpu access
> > + * @new:				malloc a new buffer for each exec
> > + * @new-race:				malloc a new buffer for each exec with race between cpu and gpu access
> > + * @new-bo-map:				malloc a new buffer or map BO for each exec
> > + * @new-busy:				malloc a new buffer for each exec, try to unbind while buffers valid
> > + * @mmap-free:				mmap and free buffer for each exec
> > + * @mmap-free-huge:			mmap huge page and free buffer for each exec
> > + * @mmap-free-race:			mmap and free buffer for each exec with race between cpu and gpu access
> > + * @mmap-new:				mmap a new buffer for each exec
> > + * @mmap-new-huge:			mmap huge page a new buffer for each exec
> > + * @mmap-new-race:			mmap a new buffer for each exec with race between cpu and gpu access
> > + * @malloc-nomemset:			malloc single buffer for all execs, skip memset of buffers
> > + * @malloc-mlock-nomemset:		malloc and mlock single buffer for all execs, skip memset of buffers
> > + * @malloc-race-nomemset:		malloc single buffer for all execs with race between cpu and gpu access, skip memset of buffers
> > + * @malloc-bo-unmap-nomemset:		malloc single buffer for all execs, bind and unbind a BO to same address before execs, skip memset of buffers
> > + * @malloc-busy-nomemset:		malloc single buffer for all execs, try to unbind while buffer valid, skip memset of buffers
> > + * @mmap-nomemset:			mmap single buffer for all execs, skip memset of buffers
> > + * @mmap-huge-nomemset:			mmap huge page single buffer for all execs, skip memset of buffers
> > + * @mmap-shared-nomemset:		mmap shared single buffer for all execs, skip memset of buffers
> > + * @mmap-mlock-nomemset:		mmap and mlock single buffer for all execs, skip memset of buffers
> > + * @mmap-file-nomemset:			mmap single buffer, with file backing, for all execs, skip memset of buffers
> > + * @mmap-file-mlock-nomemset:		mmap and mlock single buffer, with file backing, for all execs, skip memset of buffers
> > + * @mmap-race-nomemset:			mmap single buffer for all execs with race between cpu and gpu access, skip memset of buffers
> > + * @free-nomemset:			malloc and free buffer for each exec, skip memset of buffers
> > + * @free-race-nomemset:			malloc and free buffer for each exec with race between cpu and gpu access, skip memset of buffers
> > + * @new-nomemset:			malloc a new buffer for each exec, skip memset of buffers
> > + * @new-race-nomemset:			malloc a new buffer for each exec with race between cpu and gpu access, skip memset of buffers
> > + * @new-bo-map-nomemset:		malloc a new buffer or map BO for each exec, skip memset of buffers
> > + * @new-busy-nomemset:			malloc a new buffer for each exec, try to unbind while buffers valid, skip memset of buffers
> > + * @mmap-free-nomemset:			mmap and free buffer for each exec, skip memset of buffers
> > + * @mmap-free-huge-nomemset:		mmap huge page and free buffer for each exec, skip memset of buffers
> > + * @mmap-free-race-nomemset:		mmap and free buffer for each exec with race between cpu and gpu access, skip memset of buffers
> > + * @mmap-new-nomemset:			mmap a new buffer for each exec, skip memset of buffers
> > + * @mmap-new-huge-nomemset:		mmap huge page new buffer for each exec, skip memset of buffers
> > + * @mmap-new-race-nomemset:		mmap a new buffer for each exec with race between cpu and gpu access, skip memset of buffers
> > + *
> > + * SUBTEST: threads-shared-vm-shared-alloc-many-stride-malloc
> > + * Description: Create multiple threads with a shared VM triggering faults on different hardware engines to same addresses
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-shared-vm-shared-alloc-many-stride-malloc-race
> > + * Description: Create multiple threads with a shared VM triggering faults on different hardware engines to same addresses, racing between CPU and GPU access
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-shared-alloc-many-stride-malloc
> > + * Description: Create multiple threads with faults on different hardware engines to the same addresses
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-shared-alloc-many-stride-malloc-sync
> > + * Description: Create multiple threads with faults on different hardware engines to the same addresses, syncing on each exec
> > + * Test category: stress test
> > + *
> > + * SUBTEST: threads-shared-alloc-many-stride-malloc-race
> > + * Description: Create multiple threads with faults on different hardware engines to the same addresses, racing between CPU and GPU access
> > + * Test category: stress test
> > + */
> > +
> > +struct test_exec_data {
> > +	uint32_t batch[32];
> > +	uint64_t pad;
> > +	uint64_t vm_sync;
> > +	uint64_t exec_sync;
> > +	uint32_t data;
> > +	uint32_t expected_data;
> > +};
> > +
> > +static void
> > +test_exec(int fd, struct drm_xe_engine_class_instance *eci,
> > +	  int n_exec_queues, int n_execs, size_t bo_size,
> > +	  size_t stride, uint32_t vm, void *alloc, pthread_barrier_t *barrier,
> > +	  unsigned int flags)
> > +{
> > +	uint64_t addr;
> > +	struct drm_xe_sync sync[1] = {
> > +		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> > +	          .timeline_value = USER_FENCE_VALUE },
> > +	};
> > +	struct drm_xe_exec exec = {
> > +		.num_batch_buffer = 1,
> > +		.num_syncs = 1,
> > +		.syncs = to_user_pointer(sync),
> > +	};
> > +	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> > +	struct test_exec_data *data, *next_data = NULL;
> > +	uint32_t bo_flags;
> > +	uint32_t bo = 0;
> > +	void **pending_free;
> > +	u64 *exec_ufence = NULL;
> > +	int i, j, b, file_fd = -1, prev_idx;
> > +	bool free_vm = false;
> > +	size_t aligned_size = bo_size ?: xe_get_default_alignment(fd);
> > +	size_t orig_size = bo_size;
> > +	struct aligned_alloc_type aligned_alloc_type;
> > +
> > +	if (flags & MULTI_FAULT) {
> > +		if (!bo_size)
> > +			return;
> > +
> > +		bo_size *= N_MULTI_FAULT;
> > +	}
> > +
> > +	if (flags & SHARED_ALLOC)
> > +		return;
> > +
> > +	if (flags & EVERY_OTHER_CHECK && odd(n_execs))
> > +		return;
> > +
> > +	if (flags & EVERY_OTHER_CHECK)
> > +		igt_assert(flags & MREMAP);
> > +
> > +	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> > +
> > +	if (flags & NEW && !(flags & FREE)) {
> > +		pending_free = malloc(sizeof(*pending_free) * n_execs);
> > +		igt_assert(pending_free);
> > +		memset(pending_free, 0, sizeof(*pending_free) * n_execs);
> > +	}
> > +
> > +	if (!vm) {
> > +		vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> > +				  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> > +		free_vm = true;
> > +	}
> > +	if (!bo_size) {
> > +		if (!stride) {
> > +			bo_size = sizeof(*data) * n_execs;
> > +			bo_size = xe_bb_size(fd, bo_size);
> > +		} else {
> > +			bo_size = stride * n_execs * sizeof(*data);
> > +			bo_size = xe_bb_size(fd, bo_size);
> > +		}
> > +	}
> > +	if (flags & HUGE_PAGE) {
> > +		aligned_size = ALIGN(aligned_size, SZ_2M);
> > +		bo_size = ALIGN(bo_size, SZ_2M);
> > +	}
> > +
> > +	if (alloc) {
> > +		data = alloc;
> > +	} else {
> > +		if (flags & MMAP) {
> > +			int mmap_flags = MAP_FIXED;
> > +
> > +			aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
> > +			data = aligned_alloc_type.ptr;
> > +			igt_assert(data);
> > +			__aligned_partial_free(&aligned_alloc_type);
> > +
> > +			if (flags & MMAP_SHARED)
> > +				mmap_flags |= MAP_SHARED;
> > +			else
> > +				mmap_flags |= MAP_PRIVATE;
> > +
> > +			if (flags & HUGE_PAGE)
> > +				mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB;
> > +
> > +			if (flags & FILE_BACKED) {
> > +				char name[] = "/tmp/xe_exec_system_allocator_datXXXXXX";
> > +
> > +				igt_assert(!(flags & NEW));
> > +
> > +				file_fd = mkstemp(name);
> > +				posix_fallocate(file_fd, 0, bo_size);
> > +			} else {
> > +				mmap_flags |= MAP_ANONYMOUS;
> > +			}
> > +
> > +			data = mmap(data, bo_size, PROT_READ |
> > +				    PROT_WRITE, mmap_flags, file_fd, 0);
> > +			igt_assert(data != MAP_FAILED);
> > +		} else {
> > +			data = aligned_alloc(aligned_size, bo_size);
> > +			igt_assert(data);
> > +		}
> > +		if (!(flags & SKIP_MEMSET))
> > +			memset(data, 0, bo_size);
> > +		if (flags & LOCK) {
> > +			igt_assert(!(flags & NEW));
> > +			mlock(data, bo_size);
> > +		}
> > +	}
> > +
> > +	for (i = 0; i < n_exec_queues; i++)
> > +		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> > +
> > +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> > +	if (free_vm) {
> > +		bind_system_allocator(sync, 1);
> > +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0, FIVE_SEC);
> > +	}
> > +	data[0].vm_sync = 0;
> > +
> > +	addr = to_user_pointer(data);
> > +
> > +	if (flags & BO_UNMAP) {
> > +		bo_flags = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
> > +		bo = xe_bo_create(fd, vm, bo_size,
> > +				  vram_if_possible(fd, eci->gt_id), bo_flags);
> > +		xe_vm_bind_async(fd, vm, 0, bo, 0, addr, bo_size, 0, 0);
> > +
> > +		__xe_vm_bind_assert(fd, vm, 0,
> > +				    0, 0, addr, bo_size,
> > +				    DRM_XE_VM_BIND_OP_MAP,
> > +				    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR, sync,
> > +				    1, 0, 0);
> > +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0,
> > +			       FIVE_SEC);
> > +		data[0].vm_sync = 0;
> > +		gem_close(fd, bo);
> > +		bo = 0;
> > +	}
> > +
> > +	if (!(flags & RACE)) {
> > +		exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
> > +				   PROT_WRITE, MAP_SHARED |
> > +				   MAP_ANONYMOUS, -1, 0);
> > +		igt_assert(exec_ufence != MAP_FAILED);
> > +		memset(exec_ufence, 0, SZ_4K);
> > +	}
> > +
> > +	for (i = 0; i < n_execs; i++) {
> > +		int idx = !stride ? i : i * stride, next_idx = !stride
> > +			? (i + 1) : (i + 1) * stride;
> > +		uint64_t batch_offset = (char *)&data[idx].batch - (char *)data;
> > +		uint64_t batch_addr = addr + batch_offset;
> > +		uint64_t sdi_offset = (char *)&data[idx].data - (char *)data;
> > +		uint64_t sdi_addr = addr + sdi_offset;
> > +		int e = i % n_exec_queues, err;
> > +		bool fault_inject = (FAULT & flags) && i == n_execs / 2;
> > +		bool fault_injected = (FAULT & flags) && i > n_execs;
> > +
> > +		if (barrier)
> > +			pthread_barrier_wait(barrier);
> > +
> > +		if (flags & MULTI_FAULT) {
> > +			b = 0;
> > +			for (j = 0; j < N_MULTI_FAULT - 1; ++j)
> > +				__write_dword(data[idx].batch,
> > +					      sdi_addr + j * orig_size,
> > +					      WRITE_VALUE(&data[idx], idx), &b);
> > +			write_dword(data[idx].batch, sdi_addr + j * orig_size,
> > +				    WRITE_VALUE(&data[idx], idx), &b);
> > +			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> > +		} else if (!(flags & EVERY_OTHER_CHECK)) {
> > +			b = 0;
> > +			write_dword(data[idx].batch, sdi_addr,
> > +				    WRITE_VALUE(&data[idx], idx), &b);
> > +			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> > +		} else if (flags & EVERY_OTHER_CHECK && !odd(i)) {
> > +			b = 0;
> > +			write_dword(data[idx].batch, sdi_addr,
> > +				    WRITE_VALUE(&data[idx], idx), &b);
> > +			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> > +
> > +			aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
> > +			next_data = aligned_alloc_type.ptr;
> > +			igt_assert(next_data);
> > +			__aligned_partial_free(&aligned_alloc_type);
> > +
> > +			b = 0;
> > +			write_dword(data[next_idx].batch,
> > +				    to_user_pointer(next_data) +
> > +				    (char *)&data[next_idx].data - (char *)data,
> > +				    WRITE_VALUE(&data[next_idx], next_idx), &b);
> > +			igt_assert(b <= ARRAY_SIZE(data[next_idx].batch));
> > +		}
> > +
> > +		if (!exec_ufence)
> > +			data[idx].exec_sync = 0;
> > +
> > +		sync[0].addr = exec_ufence ? to_user_pointer(exec_ufence) :
> > +			addr + (char *)&data[idx].exec_sync - (char *)data;
> > +
> > +		exec.exec_queue_id = exec_queues[e];
> > +		if (fault_inject)
> > +			exec.address = batch_addr * 2;
> > +		else
> > +			exec.address = batch_addr;
> > +
> > +		if (fault_injected) {
> > +			err = __xe_exec(fd, &exec);
> > +			igt_assert(err == -ENOENT);
> > +		} else {
> > +			xe_exec(fd, &exec);
> > +		}
> > +
> > +		if (barrier)
> > +			pthread_barrier_wait(barrier);
> > +
> > +		if (fault_inject || fault_injected) {
> > +			int64_t timeout = QUARTER_SEC;
> > +
> > +			err = __xe_wait_ufence(fd, exec_ufence ? exec_ufence :
> > +					       &data[idx].exec_sync,
> > +					       USER_FENCE_VALUE,
> > +					       exec_queues[e], &timeout);
> > +			igt_assert(err == -ETIME || err == -EIO);
> > +		} else {
> > +			xe_wait_ufence(fd, exec_ufence ? exec_ufence :
> > +				       &data[idx].exec_sync, USER_FENCE_VALUE,
> > +				       exec_queues[e], FIVE_SEC);
> > +			if (flags & LOCK && !i)
> > +				munlock(data, bo_size);
> > +
> > +			if (flags & MREMAP) {
> > +				void *old = data;
> > +				int remap_flags = MREMAP_MAYMOVE | MREMAP_FIXED;
> > +
> > +				if (flags & DONTUNMAP)
> > +					remap_flags |= MREMAP_DONTUNMAP;
> > +
> > +				if (flags & READ_ONLY_REMAP)
> > +					igt_assert(!mprotect(old, bo_size,
> > +							     PROT_READ));
> > +
> > +				if (!next_data) {
> > +					aligned_alloc_type = __aligned_alloc(aligned_size,
> > +								    bo_size);
> > +					data = aligned_alloc_type.ptr;
> > +					__aligned_partial_free(&aligned_alloc_type);
> > +				} else {
> > +					data = next_data;
> > +				}
> > +				next_data = NULL;
> > +				igt_assert(data);
> > +
> > +				data = mremap(old, bo_size, bo_size,
> > +					      remap_flags, data);
> > +				igt_assert(data != MAP_FAILED);
> > +
> > +				if (flags & READ_ONLY_REMAP)
> > +					igt_assert(!mprotect(data, bo_size,
> > +							     PROT_READ |
> > +							     PROT_WRITE));
> > +
> > +				addr = to_user_pointer(data);
> > +				if (flags & DONTUNMAP)
> > +					munmap(old, bo_size);
> > +			}
> > +
> > +			if (!(flags & EVERY_OTHER_CHECK) || odd(i)) {
> > +				if (flags & FORK_READ) {
> > +					igt_fork(child, 1)
> > +						igt_assert_eq(data[idx].data,
> > +							      READ_VALUE(&data[idx]));
> > +					if (!(flags & FORK_READ_AFTER))
> > +						igt_assert_eq(data[idx].data,
> > +							      READ_VALUE(&data[idx]));
> > +					igt_waitchildren();
> > +					if (flags & FORK_READ_AFTER)
> > +						igt_assert_eq(data[idx].data,
> > +							      READ_VALUE(&data[idx]));
> > +				} else {
> > +					igt_assert_eq(data[idx].data,
> > +						      READ_VALUE(&data[idx]));
> > +
> > +					if (flags & MULTI_FAULT) {
> > +						for (j = 1; j < N_MULTI_FAULT; ++j) {
> > +							struct test_exec_data *__data =
> > +								((void *)data) + j * orig_size;
> > +
> > +							igt_assert_eq(__data[idx].data,
> > +								      READ_VALUE(&data[idx]));
> > +						}
> > +					}
> > +				}
> > +				if (flags & EVERY_OTHER_CHECK)
> > +					igt_assert_eq(data[prev_idx].data,
> > +						      READ_VALUE(&data[prev_idx]));
> > +			}
> > +		}
> > +
> > +		if (exec_ufence)
> > +			exec_ufence[0] = 0;
> > +
> > +		if (bo) {
> > +			__xe_vm_bind_assert(fd, vm, 0,
> > +					    0, 0, addr, bo_size,
> > +					    DRM_XE_VM_BIND_OP_MAP,
> > +					    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,
> > +					    NULL, 0, 0, 0);
> > +			munmap(data, bo_size);
> > +			gem_close(fd, bo);
> > +		}
> > +
> > +		if (flags & NEW) {
> > +			if (flags & MMAP) {
> > +				if (flags & FREE)
> > +					munmap(data, bo_size);
> > +				else
> > +					pending_free[i] = data;
> > +				data = mmap(NULL, bo_size, PROT_READ |
> > +					    PROT_WRITE, MAP_SHARED |
> > +					    MAP_ANONYMOUS, -1, 0);
> > +				igt_assert(data != MAP_FAILED);
> > +			} else if (flags & BO_MAP && odd(i)) {
> > +				if (!bo) {
> > +					if (flags & FREE)
> > +						free(data);
> > +					else
> > +						pending_free[i] = data;
> > +				}
> > +
> > +				aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
> > +				data = aligned_alloc_type.ptr;
> > +				igt_assert(data);
> > +				__aligned_partial_free(&aligned_alloc_type);
> > +
> > +				bo_flags = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
> > +				bo = xe_bo_create(fd, vm, bo_size,
> > +						  vram_if_possible(fd, eci->gt_id),
> > +						  bo_flags);
> > +				data = xe_bo_map_fixed(fd, bo, bo_size,
> > +						       to_user_pointer(data));
> > +
> > +				xe_vm_bind_async(fd, vm, 0, bo, 0,
> > +						 to_user_pointer(data),
> > +						 bo_size, 0, 0);
> > +			} else {
> > +				if (!bo) {
> > +					if (flags & FREE)
> > +						free(data);
> > +					else
> > +						pending_free[i] = data;
> > +				}
> > +				bo = 0;
> > +				data = aligned_alloc(aligned_size, bo_size);
> > +				igt_assert(data);
> > +			}
> > +			addr = to_user_pointer(data);
> > +			if (!(flags & SKIP_MEMSET))
> > +				memset(data, 0, bo_size);
> > +		}
> > +
> > +		prev_idx = idx;
> > +	}
> > +
> > +	if (bo) {
> > +		__xe_vm_bind_assert(fd, vm, 0,
> > +				    0, 0, addr, bo_size,
> > +				    DRM_XE_VM_BIND_OP_MAP,
> > +				    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,
> > +				    NULL, 0, 0, 0);
> > +		munmap(data, bo_size);
> > +		data = NULL;
> > +		gem_close(fd, bo);
> > +	}
> > +
> > +	if (flags & BUSY)
> > +		igt_assert_eq(unbind_system_allocator(), -EBUSY);
> > +
> > +	for (i = 0; i < n_exec_queues; i++)
> > +		xe_exec_queue_destroy(fd, exec_queues[i]);
> > +
> > +	if (exec_ufence)
> > +		munmap(exec_ufence, SZ_4K);
> > +
> > +	if (flags & LOCK)
> > +		munlock(data, bo_size);
> > +
> > +	if (file_fd != -1)
> > +		close(file_fd);
> > +
> > +	if (flags & NEW && !(flags & FREE)) {
> > +		for (i = 0; i < n_execs; i++) {
> > +			if (!pending_free[i])
> > +				continue;
> > +
> > +			if (flags & MMAP)
> > +				munmap(pending_free[i], bo_size);
> > +			else
> > +				free(pending_free[i]);
> > +		}
> > +		free(pending_free);
> > +	}
> > +	if (data) {
> > +		if (flags & MMAP)
> > +			munmap(data, bo_size);
> > +		else if (!alloc)
> > +			free(data);
> > +	}
> > +	if (free_vm)
> > +		xe_vm_destroy(fd, vm);
> > +}
> > +
> > +struct thread_data {
> > +	pthread_t thread;
> > +	pthread_mutex_t *mutex;
> > +	pthread_cond_t *cond;
> > +	pthread_barrier_t *barrier;
> > +	int fd;
> > +	struct drm_xe_engine_class_instance *eci;
> > +	int n_exec_queues;
> > +	int n_execs;
> > +	size_t bo_size;
> > +	size_t stride;
> > +	uint32_t vm;
> > +	unsigned int flags;
> > +	void *alloc;
> > +	bool *go;
> > +};
> > +
> > +static void *thread(void *data)
> > +{
> > +	struct thread_data *t = data;
> > +
> > +	pthread_mutex_lock(t->mutex);
> > +	while (!*t->go)
> > +		pthread_cond_wait(t->cond, t->mutex);
> > +	pthread_mutex_unlock(t->mutex);
> > +
> > +	test_exec(t->fd, t->eci, t->n_exec_queues, t->n_execs,
> > +		  t->bo_size, t->stride, t->vm, t->alloc, t->barrier,
> > +		  t->flags);
> > +
> > +	return NULL;
> > +}
> > +
> > +static void
> > +threads(int fd, int n_exec_queues, int n_execs, size_t bo_size,
> > +	size_t stride, unsigned int flags, bool shared_vm)
> > +{
> > +	struct drm_xe_engine_class_instance *hwe;
> > +	struct thread_data *threads_data;
> > +	int n_engines = 0, i = 0;
> > +	pthread_mutex_t mutex;
> > +	pthread_cond_t cond;
> > +	pthread_barrier_t barrier;
> > +	uint32_t vm = 0;
> > +	bool go = false;
> > +	void *alloc = NULL;
> > +
> > +	if ((FILE_BACKED | FORK_READ) & flags)
> > +		return;
> > +
> > +	xe_for_each_engine(fd, hwe)
> > +		++n_engines;
> > +
> > +	if (shared_vm) {
> > +		vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> > +				  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> > +		bind_system_allocator(NULL, 0);
> > +	}
> > +
> > +	if (flags & SHARED_ALLOC) {
> > +		uint64_t alloc_size;
> > +
> > +		igt_assert(stride);
> > +
> > +		alloc_size = sizeof(struct test_exec_data) * stride *
> > +			n_execs * n_engines;
> > +		alloc_size = xe_bb_size(fd, alloc_size);
> > +		alloc = aligned_alloc(SZ_2M, alloc_size);
> > +		igt_assert(alloc);
> > +
> > +		memset(alloc, 0, alloc_size);
> > +		flags &= ~SHARED_ALLOC;
> > +	}
> > +
> > +	threads_data = calloc(n_engines, sizeof(*threads_data));
> > +	igt_assert(threads_data);
> > +
> > +	pthread_mutex_init(&mutex, 0);
> > +	pthread_cond_init(&cond, 0);
> > +	pthread_barrier_init(&barrier, 0, n_engines);
> > +
> > +	xe_for_each_engine(fd, hwe) {
> > +		threads_data[i].mutex = &mutex;
> > +		threads_data[i].cond = &cond;
> > +		threads_data[i].barrier = (flags & SYNC_EXEC) ? &barrier : NULL;
> > +		threads_data[i].fd = fd;
> > +		threads_data[i].eci = hwe;
> > +		threads_data[i].n_exec_queues = n_exec_queues;
> > +		threads_data[i].n_execs = n_execs;
> > +		threads_data[i].bo_size = bo_size;
> > +		threads_data[i].stride = stride;
> > +		threads_data[i].vm = vm;
> > +		threads_data[i].flags = flags;
> > +		threads_data[i].alloc = alloc ? alloc + i *
> > +			sizeof(struct test_exec_data) : NULL;
> > +		threads_data[i].go = &go;
> > +		pthread_create(&threads_data[i].thread, 0, thread,
> > +			       &threads_data[i]);
> > +		++i;
> > +	}
> > +
> > +	pthread_mutex_lock(&mutex);
> > +	go = true;
> > +	pthread_cond_broadcast(&cond);
> > +	pthread_mutex_unlock(&mutex);
> > +
> > +	for (i = 0; i < n_engines; ++i)
> > +		pthread_join(threads_data[i].thread, NULL);
> > +
> > +	if (shared_vm) {
> > +		int ret;
> > +
> > +		if (flags & MMAP) {
> > +			int tries = 300;
> > +
> > +			while (tries && (ret = unbind_system_allocator()) == -EBUSY) {
> > +				sleep(.01);
> > +				--tries;
> > +			}
> > +			igt_assert_eq(ret, 0);
> > +		}
> > +		xe_vm_destroy(fd, vm);
> > +		if (alloc)
> > +			free(alloc);
> > +	}
> > +	free(threads_data);
> > +}
> > +
> > +static void process(struct drm_xe_engine_class_instance *hwe, int n_exec_queues,
> > +		    int n_execs, size_t bo_size, size_t stride,
> > +		    unsigned int flags)
> > +{
> > +	struct process_data *pdata;
> > +	int map_fd;
> > +	int fd;
> > +
> > +	map_fd = open(sync_file, O_RDWR, 0x666);
> > +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> > +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> > +	wait_pdata(pdata);
> > +
> > +	fd = drm_open_driver(DRIVER_XE);
> > +	test_exec(fd, hwe, n_exec_queues, n_execs,
> > +		  bo_size, stride, 0, NULL, NULL, flags);
> > +	drm_close_driver(fd);
> > +
> > +	close(map_fd);
> > +	munmap(pdata, sizeof(*pdata));
> > +}
> > +
> > +static void
> > +processes(int fd, int n_exec_queues, int n_execs, size_t bo_size,
> > +	  size_t stride, unsigned int flags)
> > +{
> > +	struct drm_xe_engine_class_instance *hwe;
> > +	struct process_data *pdata;
> > +	int map_fd;
> > +
> > +	if (flags & FORK_READ)
> > +		return;
> > +
> > +	map_fd = open(sync_file, O_RDWR | O_CREAT, 0x666);
> > +	posix_fallocate(map_fd, 0, sizeof(*pdata));
> > +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> > +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> > +
> > +	init_pdata(pdata, 0);
> > +
> > +	xe_for_each_engine(fd, hwe) {
> > +		igt_fork(child, 1)
> > +			process(hwe, n_exec_queues, n_execs, bo_size,
> > +				stride, flags);
> > +	}
> > +
> > +	signal_pdata(pdata);
> > +	igt_waitchildren();
> > +
> > +	close(map_fd);
> > +	munmap(pdata, sizeof(*pdata));
> > +}
> > +
> > +struct section {
> > +	const char *name;
> > +	unsigned int flags;
> > +};
> > +
> > +igt_main
> > +{
> > +	struct drm_xe_engine_class_instance *hwe;
> > +	const struct section sections[] = {
> > +		{ "malloc", 0 },
> > +		{ "malloc-multi-fault", MULTI_FAULT },
> > +		{ "malloc-fork-read", FORK_READ },
> > +		{ "malloc-fork-read-after", FORK_READ | FORK_READ_AFTER },
> > +		{ "malloc-mlock", LOCK },
> > +		{ "malloc-race", RACE },
> > +		{ "malloc-busy", BUSY },
> > +		{ "malloc-bo-unmap", BO_UNMAP },
> > +		{ "mmap", MMAP },
> > +		{ "mmap-remap", MMAP | MREMAP },
> > +		{ "mmap-remap-dontunmap", MMAP | MREMAP | DONTUNMAP },
> > +		{ "mmap-remap-ro", MMAP | MREMAP | READ_ONLY_REMAP },
> > +		{ "mmap-remap-ro-dontunmap", MMAP | MREMAP | DONTUNMAP |
> > +			READ_ONLY_REMAP },
> > +		{ "mmap-remap-eocheck", MMAP | MREMAP | EVERY_OTHER_CHECK },
> > +		{ "mmap-remap-dontunmap-eocheck", MMAP | MREMAP | DONTUNMAP |
> > +			EVERY_OTHER_CHECK },
> > +		{ "mmap-remap-ro-eocheck", MMAP | MREMAP | READ_ONLY_REMAP |
> > +			EVERY_OTHER_CHECK },
> > +		{ "mmap-remap-ro-dontunmap-eocheck", MMAP | MREMAP | DONTUNMAP |
> > +			READ_ONLY_REMAP | EVERY_OTHER_CHECK },
> > +		{ "mmap-huge", MMAP | HUGE_PAGE },
> > +		{ "mmap-shared", MMAP | LOCK | MMAP_SHARED },
> > +		{ "mmap-shared-remap", MMAP | LOCK | MMAP_SHARED | MREMAP },
> > +		{ "mmap-shared-remap-dontunmap", MMAP | LOCK | MMAP_SHARED |
> > +			MREMAP | DONTUNMAP },
> > +		{ "mmap-shared-remap-eocheck", MMAP | LOCK | MMAP_SHARED |
> > +			MREMAP | EVERY_OTHER_CHECK },
> > +		{ "mmap-shared-remap-dontunmap-eocheck", MMAP | LOCK |
> > +			MMAP_SHARED | MREMAP | DONTUNMAP | EVERY_OTHER_CHECK },
> > +		{ "mmap-mlock", MMAP | LOCK },
> > +		{ "mmap-file", MMAP | FILE_BACKED },
> > +		{ "mmap-file-mlock", MMAP | LOCK | FILE_BACKED },
> > +		{ "mmap-race", MMAP | RACE },
> > +		{ "free", NEW | FREE },
> > +		{ "free-race", NEW | FREE | RACE },
> > +		{ "new", NEW },
> > +		{ "new-race", NEW | RACE },
> > +		{ "new-bo-map", NEW | BO_MAP },
> > +		{ "new-busy", NEW | BUSY },
> > +		{ "mmap-free", MMAP | NEW | FREE },
> > +		{ "mmap-free-huge", MMAP | NEW | FREE | HUGE_PAGE },
> > +		{ "mmap-free-race", MMAP | NEW | FREE | RACE },
> > +		{ "mmap-new", MMAP | NEW },
> > +		{ "mmap-new-huge", MMAP | NEW | HUGE_PAGE },
> > +		{ "mmap-new-race", MMAP | NEW | RACE },
> > +		{ "malloc-nomemset", SKIP_MEMSET },
> > +		{ "malloc-mlock-nomemset", SKIP_MEMSET | LOCK },
> > +		{ "malloc-race-nomemset", SKIP_MEMSET | RACE },
> > +		{ "malloc-busy-nomemset", SKIP_MEMSET | BUSY },
> > +		{ "malloc-bo-unmap-nomemset", SKIP_MEMSET | BO_UNMAP },
> > +		{ "mmap-nomemset", SKIP_MEMSET | MMAP },
> > +		{ "mmap-huge-nomemset", SKIP_MEMSET | MMAP | HUGE_PAGE },
> > +		{ "mmap-shared-nomemset", SKIP_MEMSET | MMAP | MMAP_SHARED },
> > +		{ "mmap-mlock-nomemset", SKIP_MEMSET | MMAP | LOCK },
> > +		{ "mmap-file-nomemset", SKIP_MEMSET | MMAP | FILE_BACKED },
> > +		{ "mmap-file-mlock-nomemset", SKIP_MEMSET | MMAP | LOCK | FILE_BACKED },
> > +		{ "mmap-race-nomemset", SKIP_MEMSET | MMAP | RACE },
> > +		{ "free-nomemset", SKIP_MEMSET | NEW | FREE },
> > +		{ "free-race-nomemset", SKIP_MEMSET | NEW | FREE | RACE },
> > +		{ "new-nomemset", SKIP_MEMSET | NEW },
> > +		{ "new-race-nomemset", SKIP_MEMSET | NEW | RACE },
> > +		{ "new-bo-map-nomemset", SKIP_MEMSET | NEW | BO_MAP },
> > +		{ "new-busy-nomemset", SKIP_MEMSET | NEW | BUSY },
> > +		{ "mmap-free-nomemset", SKIP_MEMSET | MMAP | NEW | FREE },
> > +		{ "mmap-free-huge-nomemset", SKIP_MEMSET | MMAP | NEW | FREE | HUGE_PAGE },
> > +		{ "mmap-free-race-nomemset", SKIP_MEMSET | MMAP | NEW | FREE | RACE },
> > +		{ "mmap-new-nomemset", SKIP_MEMSET | MMAP | NEW },
> > +		{ "mmap-new-huge-nomemset", SKIP_MEMSET | MMAP | NEW | HUGE_PAGE },
> > +		{ "mmap-new-race-nomemset", SKIP_MEMSET | MMAP | NEW | RACE },
> > +		{ NULL },
> > +	};
> > +	const struct section psections[] = {
> > +		{ "munmap-cpu-fault", CPU_FAULT },
> > +		{ "munmap-no-cpu-fault", 0 },
> > +		{ "remap-cpu-fault", CPU_FAULT | REMAP },
> > +		{ "remap-no-cpu-fault", REMAP },
> > +		{ "middle-munmap-cpu-fault", MIDDLE | CPU_FAULT },
> > +		{ "middle-munmap-no-cpu-fault", MIDDLE },
> > +		{ "middle-remap-cpu-fault", MIDDLE | CPU_FAULT | REMAP },
> > +		{ "middle-remap-no-cpu-fault", MIDDLE | REMAP },
> > +		{ NULL },
> > +	};
> > +	const struct section esections[] = {
> > +		{ "malloc", 0 },
> > +		{ "malloc-mix-bo", MIX_BO_ALLOC },
> > +		{ NULL },
> > +	};
> > +	int fd;
> > +
> > +	igt_fixture {
> > +		struct xe_device *xe;
> > +
> > +		fd = drm_open_driver(DRIVER_XE);
> > +		igt_require(!xe_supports_faults(fd));
> > +
> > +		xe = xe_device_get(fd);
> > +		va_bits = xe->va_bits;
> > +		open_sync_file();
> > +	}
> > +
> > +	for (const struct section *s = sections; s->name; s++) {
> > +		igt_subtest_f("once-%s", s->name)
> > +			xe_for_each_engine(fd, hwe)
> > +				test_exec(fd, hwe, 1, 1, 0, 0, 0, NULL,
> > +					  NULL, s->flags);
> > +
> > +		igt_subtest_f("once-large-%s", s->name)
> > +			xe_for_each_engine(fd, hwe)
> > +				test_exec(fd, hwe, 1, 1, SZ_2M, 0, 0, NULL,
> > +					  NULL, s->flags);
> > +
> > +		igt_subtest_f("twice-%s", s->name)
> > +			xe_for_each_engine(fd, hwe)
> > +				test_exec(fd, hwe, 1, 2, 0, 0, 0, NULL,
> > +					  NULL, s->flags);
> > +
> > +		igt_subtest_f("twice-large-%s", s->name)
> > +			xe_for_each_engine(fd, hwe)
> > +				test_exec(fd, hwe, 1, 2, SZ_2M, 0, 0, NULL,
> > +					  NULL, s->flags);
> > +
> > +		igt_subtest_f("many-%s", s->name)
> > +			xe_for_each_engine(fd, hwe)
> > +				test_exec(fd, hwe, 1, 128, 0, 0, 0, NULL,
> > +					  NULL, s->flags);
> > +
> > +		igt_subtest_f("many-stride-%s", s->name)
> > +			xe_for_each_engine(fd, hwe)
> > +				test_exec(fd, hwe, 1, 128, 0, 256, 0, NULL,
> > +					  NULL, s->flags);
> > +
> > +		igt_subtest_f("many-execqueues-%s", s->name)
> > +			xe_for_each_engine(fd, hwe)
> > +				test_exec(fd, hwe, 16, 128, 0, 0, 0, NULL,
> > +					  NULL, s->flags);
> > +
> > +		igt_subtest_f("many-large-%s", s->name)
> > +			xe_for_each_engine(fd, hwe)
> > +				test_exec(fd, hwe, 1, 128, SZ_2M, 0, 0, NULL,
> > +					  NULL, s->flags);
> > +
> > +		igt_subtest_f("many-large-execqueues-%s", s->name)
> > +			xe_for_each_engine(fd, hwe)
> > +				test_exec(fd, hwe, 16, 128, SZ_2M, 0, 0, NULL,
> > +					  NULL, s->flags);
> > +
> > +		igt_subtest_f("threads-many-%s", s->name)
> > +			threads(fd, 1, 128, 0, 0, s->flags, false);
> > +
> > +		igt_subtest_f("threads-many-stride-%s", s->name)
> > +			threads(fd, 1, 128, 0, 256, s->flags, false);
> > +
> > +		igt_subtest_f("threads-many-execqueues-%s", s->name)
> > +			threads(fd, 16, 128, 0, 0, s->flags, false);
> > +
> > +		igt_subtest_f("threads-many-large-%s", s->name)
> > +			threads(fd, 1, 128, SZ_2M, 0, s->flags, false);
> > +
> > +		igt_subtest_f("threads-many-large-execqueues-%s", s->name)
> > +			threads(fd, 16, 128, SZ_2M, 0, s->flags, false);
> > +
> > +		igt_subtest_f("threads-shared-vm-many-%s", s->name)
> > +			threads(fd, 1, 128, 0, 0, s->flags, true);
> > +
> > +		igt_subtest_f("threads-shared-vm-many-stride-%s", s->name)
> > +			threads(fd, 1, 128, 0, 256, s->flags, true);
> > +
> > +		igt_subtest_f("threads-shared-vm-many-execqueues-%s", s->name)
> > +			threads(fd, 16, 128, 0, 0, s->flags, true);
> > +
> > +		igt_subtest_f("threads-shared-vm-many-large-%s", s->name)
> > +			threads(fd, 1, 128, SZ_2M, 0, s->flags, true);
> > +
> > +		igt_subtest_f("threads-shared-vm-many-large-execqueues-%s", s->name)
> > +			threads(fd, 16, 128, SZ_2M, 0, s->flags, true);
> > +
> > +		igt_subtest_f("process-many-%s", s->name)
> > +			processes(fd, 1, 128, 0, 0, s->flags);
> > +
> > +		igt_subtest_f("process-many-stride-%s", s->name)
> > +			processes(fd, 1, 128, 0, 256, s->flags);
> > +
> > +		igt_subtest_f("process-many-execqueues-%s", s->name)
> > +			processes(fd, 16, 128, 0, 0, s->flags);
> > +
> > +		igt_subtest_f("process-many-large-%s", s->name)
> > +			processes(fd, 1, 128, SZ_2M, 0, s->flags);
> > +
> > +		igt_subtest_f("process-many-large-execqueues-%s", s->name)
> > +			processes(fd, 16, 128, SZ_2M, 0, s->flags);
> > +	}
> > +
> > +	igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc")
> > +		threads(fd, 1, 128, 0, 256, SHARED_ALLOC, true);
> > +
> > +	igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc-race")
> > +		threads(fd, 1, 128, 0, 256, RACE | SHARED_ALLOC, true);
> > +
> > +	igt_subtest("threads-shared-alloc-many-stride-malloc")
> > +		threads(fd, 1, 128, 0, 256, SHARED_ALLOC, false);
> > +
> > +	igt_subtest("threads-shared-alloc-many-stride-malloc-sync")
> > +		threads(fd, 1, 128, 0, 256, SHARED_ALLOC | SYNC_EXEC, false);
> > +
> > +	igt_subtest("threads-shared-alloc-many-stride-malloc-race")
> > +		threads(fd, 1, 128, 0, 256, RACE | SHARED_ALLOC, false);
> > +
> > +	igt_subtest_f("fault")
> > +		xe_for_each_engine(fd, hwe)
> > +			test_exec(fd, hwe, 4, 1, SZ_2M, 0, 0, NULL, NULL,
> > +				  FAULT);
> > +
> > +	for (const struct section *s = psections; s->name; s++) {
> > +		igt_subtest_f("partial-%s", s->name)
> > +			xe_for_each_engine(fd, hwe)
> > +				partial(fd, hwe, s->flags);
> > +	}
> > +
> > +	igt_subtest_f("unaligned-alloc")
> > +		xe_for_each_engine(fd, hwe) {
> > +			many_allocs(fd, hwe, (SZ_1M + SZ_512K) * 8,
> > +				    SZ_1M + SZ_512K, SZ_4K, NULL, 0);
> > +			break;
> > +		}
> > +
> > +	igt_subtest_f("fault-benchmark")
> > +		xe_for_each_engine(fd, hwe)
> > +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> > +				    BENCHMARK);
> > +
> > +	igt_subtest_f("fault-threads-benchmark")
> > +		xe_for_each_engine(fd, hwe)
> > +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> > +				    BENCHMARK | CPU_FAULT_THREADS);
> > +
> > +	igt_subtest_f("fault-threads-same-page-benchmark")
> > +		xe_for_each_engine(fd, hwe)
> > +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> > +				    BENCHMARK | CPU_FAULT_THREADS |
> > +				    CPU_FAULT_SAME_PAGE);
> > +
> > +	igt_subtest_f("fault-process-benchmark")
> > +		xe_for_each_engine(fd, hwe)
> > +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> > +				    BENCHMARK | CPU_FAULT_PROCESS);
> > +
> > +	igt_subtest_f("fault-process-same-page-benchmark")
> > +		xe_for_each_engine(fd, hwe)
> > +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> > +				    BENCHMARK | CPU_FAULT_PROCESS |
> > +				    CPU_FAULT_SAME_PAGE);
> > +
> > +	for (const struct section *s = esections; s->name; s++) {
> > +		igt_subtest_f("evict-%s", s->name)
> > +			xe_for_each_engine(fd, hwe) {
> > +				many_allocs(fd, hwe,
> > +					    xe_visible_vram_size(fd, hwe->gt_id),
> > +					    SZ_8M, SZ_1M, NULL, s->flags);
> > +				break;
> > +			}
> > +	}
> > +
> > +	for (const struct section *s = esections; s->name; s++) {
> > +		igt_subtest_f("processes-evict-%s", s->name)
> > +			processes_evict(fd, SZ_8M, SZ_1M, s->flags);
> > +	}
> > +
> > +	igt_fixture {
> > +		xe_device_put(fd);
> > +		drm_close_driver(fd);
> > +		close_sync_file();
> > +	}
> > +}
> > diff --git a/tests/meson.build b/tests/meson.build
> > index 6328792e3a..20ddddb89f 100644
> > --- a/tests/meson.build
> > +++ b/tests/meson.build
> > @@ -295,6 +295,7 @@ intel_xe_progs = [
> >  	'xe_exec_reset',
> >  	'xe_exec_sip',
> >  	'xe_exec_store',
> > +	'xe_exec_system_allocator',
> >  	'xe_exec_threads',
> >  	'xe_exercise_blt',
> >  	'xe_fault_injection',
> > -- 
> > 2.34.1
> > 
> > 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] uapi/xe: Sync latest uAPI KMD headers
  2025-04-24 20:44 ` [PATCH v3 1/2] uapi/xe: Sync latest uAPI KMD headers Matthew Brost
  2025-04-24 22:32   ` Cavitt, Jonathan
@ 2025-04-25  6:54   ` Francois Dugast
  1 sibling, 0 replies; 8+ messages in thread
From: Francois Dugast @ 2025-04-25  6:54 UTC (permalink / raw)
  To: Matthew Brost; +Cc: igt-dev

On Thu, Apr 24, 2025 at 01:44:05PM -0700, Matthew Brost wrote:
> Pull in latest uAPI KMD headers to enable testing of new features.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Reviewed-by: Francois Dugast <francois.dugast@intel.com>

> ---
>  include/drm-uapi/xe_drm.h | 49 ++++++++++++++++++++++++++++++++++++---
>  1 file changed, 46 insertions(+), 3 deletions(-)
> 
> diff --git a/include/drm-uapi/xe_drm.h b/include/drm-uapi/xe_drm.h
> index 154f947ef0..c90fab1b00 100644
> --- a/include/drm-uapi/xe_drm.h
> +++ b/include/drm-uapi/xe_drm.h
> @@ -393,6 +393,10 @@ struct drm_xe_query_mem_regions {
>   *
>   *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM - Flag is set if the device
>   *      has usable VRAM
> + *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY - Flag is set if the device
> + *      has low latency hint support
> + *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR - Flag is set if the
> + *      device has CPU address mirroring support
>   *  - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment
>   *    required by this device, typically SZ_4K or SZ_64K
>   *  - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address
> @@ -409,6 +413,8 @@ struct drm_xe_query_config {
>  #define DRM_XE_QUERY_CONFIG_REV_AND_DEVICE_ID	0
>  #define DRM_XE_QUERY_CONFIG_FLAGS			1
>  	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM	(1 << 0)
> +	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY	(1 << 1)
> +	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR	(1 << 2)
>  #define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT		2
>  #define DRM_XE_QUERY_CONFIG_VA_BITS			3
>  #define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY	4
> @@ -911,7 +917,11 @@ struct drm_xe_gem_mmap_offset {
>   * struct drm_xe_vm_create - Input of &DRM_IOCTL_XE_VM_CREATE
>   *
>   * The @flags can be:
> - *  - %DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE
> + *  - %DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE - Map the whole virtual address
> + *    space of the VM to scratch page. A vm_bind would overwrite the scratch
> + *    page mapping. This flag is mutually exclusive with the
> + *    %DRM_XE_VM_CREATE_FLAG_FAULT_MODE flag, with the exception of x2 and
> + *    xe3 platforms.
>   *  - %DRM_XE_VM_CREATE_FLAG_LR_MODE - An LR, or Long Running VM accepts
>   *    exec submissions to its exec_queues that don't have an upper time
>   *    limit on the job execution time. But exec submissions to these
> @@ -987,6 +997,12 @@ struct drm_xe_vm_destroy {
>   *  - %DRM_XE_VM_BIND_FLAG_CHECK_PXP - If the object is encrypted via PXP,
>   *    reject the binding if the encryption key is no longer valid. This
>   *    flag has no effect on BOs that are not marked as using PXP.
> + *  - %DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR - When the CPU address mirror flag is
> + *    set, no mappings are created rather the range is reserved for CPU address
> + *    mirroring which will be populated on GPU page faults or prefetches. Only
> + *    valid on VMs with DRM_XE_VM_CREATE_FLAG_FAULT_MODE set. The CPU address
> + *    mirror flag is only valid for DRM_XE_VM_BIND_OP_MAP operations, the BO
> + *    handle MBZ, and the BO offset MBZ.
>   */
>  struct drm_xe_vm_bind_op {
>  	/** @extensions: Pointer to the first extension struct, if any */
> @@ -1039,7 +1055,9 @@ struct drm_xe_vm_bind_op {
>  	 * on the @pat_index. For such mappings there is no actual memory being
>  	 * mapped (the address in the PTE is invalid), so the various PAT memory
>  	 * attributes likely do not apply.  Simply leaving as zero is one
> -	 * option (still a valid pat_index).
> +	 * option (still a valid pat_index). Same applies to
> +	 * DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR bindings as for such mapping
> +	 * there is no actual memory being mapped.
>  	 */
>  	__u16 pat_index;
>  
> @@ -1055,6 +1073,14 @@ struct drm_xe_vm_bind_op {
>  
>  		/** @userptr: user pointer to bind on */
>  		__u64 userptr;
> +
> +		/**
> +		 * @cpu_addr_mirror_offset: Offset from GPU @addr to create
> +		 * CPU address mirror mappings. MBZ with current level of
> +		 * support (e.g. 1 to 1 mapping between GPU and CPU mappings
> +		 * only supported).
> +		 */
> +		__s64 cpu_addr_mirror_offset;
>  	};
>  
>  	/**
> @@ -1078,6 +1104,7 @@ struct drm_xe_vm_bind_op {
>  #define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
>  #define DRM_XE_VM_BIND_FLAG_DUMPABLE	(1 << 3)
>  #define DRM_XE_VM_BIND_FLAG_CHECK_PXP	(1 << 4)
> +#define DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR	(1 << 5)
>  	/** @flags: Bind flags */
>  	__u32 flags;
>  
> @@ -1205,6 +1232,21 @@ struct drm_xe_vm_bind {
>   *     };
>   *     ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
>   *
> + *     Allow users to provide a hint to kernel for cases demanding low latency
> + *     profile. Please note it will have impact on power consumption. User can
> + *     indicate low latency hint with flag while creating exec queue as
> + *     mentioned below,
> + *
> + *     struct drm_xe_exec_queue_create exec_queue_create = {
> + *          .flags = DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT,
> + *          .extensions = 0,
> + *          .vm_id = vm,
> + *          .num_bb_per_exec = 1,
> + *          .num_eng_per_bb = 1,
> + *          .instances = to_user_pointer(&instance),
> + *     };
> + *     ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
> + *
>   */
>  struct drm_xe_exec_queue_create {
>  #define DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY		0
> @@ -1223,7 +1265,8 @@ struct drm_xe_exec_queue_create {
>  	/** @vm_id: VM to use for this exec queue */
>  	__u32 vm_id;
>  
> -	/** @flags: MBZ */
> +#define DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT	(1 << 0)
> +	/** @flags: flags to use for this exec queue */
>  	__u32 flags;
>  
>  	/** @exec_queue_id: Returned exec queue ID */
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 2/2] tests/xe: Add system_allocator test
  2025-04-24 20:44 ` [PATCH v3 2/2] tests/xe: Add system_allocator test Matthew Brost
  2025-04-24 22:32   ` Cavitt, Jonathan
@ 2025-04-25  7:06   ` Francois Dugast
  1 sibling, 0 replies; 8+ messages in thread
From: Francois Dugast @ 2025-04-25  7:06 UTC (permalink / raw)
  To: Matthew Brost; +Cc: igt-dev

On Thu, Apr 24, 2025 at 01:44:06PM -0700, Matthew Brost wrote:
> Test various uses of system allocator in single thread, multiple
> threads, and multiple processes.
> 
> Features tested:
>  - Malloc with various sizes
>  - Mmap with various sizes and flags including file backed mappings
>  - Mixing BO allocations with system allocator
>  - Various page sizes
>  - Dynamically freeing / unmapping memory
>  - Sharing VM across threads
>  - Faults racing on different hardware engines / GTs / Tiles
>  - GPU faults and CPU faults racing
>  - CPU faults on multiple threads racing
>  - CPU faults on multiple processes racing
>  - GPU faults of memory not faulted in by CPU
>  - Partial unmap of allocations
>  - Attempting to unmap system allocations when GPU has mappings
>  - Eviction of both system allocations and BOs
>  - Forking child processes and reading data from VRAM
>  - mremap data in VRAM
>  - Protection changes
>  - Multiple faults per execbuf
> 
> Running on LNL, BMG, PVC 1 tile, and PVC 2 tile.
> 
> v2:
>  - Rebase
>  - Fix memory allocation to not interfere with malloc (Thomas)
> v3:
>  - Fix memory leak (Francois)
>  - Break out uAPI into own patch (Francois)
>  - Use mkstemp for sync file (Francois)
>  - Use mkstemp for file backed data (Francois)
>  - Drop i argument from READ_VALUE (Francois)
>  - Fix test description (Francois)
>  - Add comment to check_all_pages_process (Francois)
>  - Prefer igt_info over printf (Francois)
>  - Fix types in messages (Francois)
>  - Prefer odd macro (Francois)
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Earlier comments [1] are addressed, thanks.

Reviewed-by: Francois Dugast <francois.dugast@intel.com>

[1] https://patchwork.freedesktop.org/patch/648841/?series=137545&rev=4#comment_1189490

Francois

> ---
>  lib/xe/xe_ioctl.c                      |   12 +
>  lib/xe/xe_ioctl.h                      |    1 +
>  tests/intel/xe_exec_system_allocator.c | 1849 ++++++++++++++++++++++++
>  tests/meson.build                      |    1 +
>  4 files changed, 1863 insertions(+)
>  create mode 100644 tests/intel/xe_exec_system_allocator.c
> 
> diff --git a/lib/xe/xe_ioctl.c b/lib/xe/xe_ioctl.c
> index fb8c4aef13..785fc9184c 100644
> --- a/lib/xe/xe_ioctl.c
> +++ b/lib/xe/xe_ioctl.c
> @@ -440,6 +440,18 @@ void *xe_bo_map(int fd, uint32_t bo, size_t size)
>  	return __xe_bo_map(fd, bo, size, PROT_WRITE);
>  }
>  
> +void *xe_bo_map_fixed(int fd, uint32_t bo, size_t size, uint64_t addr)
> +{
> +	uint64_t mmo;
> +	void *map;
> +
> +	mmo = xe_bo_mmap_offset(fd, bo);
> +	map = mmap((void *)addr, size, PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, mmo);
> +	igt_assert(map != MAP_FAILED);
> +
> +	return map;
> +}
> +
>  void *xe_bo_mmap_ext(int fd, uint32_t bo, size_t size, int prot)
>  {
>  	return __xe_bo_map(fd, bo, size, prot);
> diff --git a/lib/xe/xe_ioctl.h b/lib/xe/xe_ioctl.h
> index 9bdf73b2bd..554a33c9cd 100644
> --- a/lib/xe/xe_ioctl.h
> +++ b/lib/xe/xe_ioctl.h
> @@ -86,6 +86,7 @@ uint32_t xe_exec_queue_create_class(int fd, uint32_t vm, uint16_t class);
>  void xe_exec_queue_destroy(int fd, uint32_t exec_queue);
>  uint64_t xe_bo_mmap_offset(int fd, uint32_t bo);
>  void *xe_bo_map(int fd, uint32_t bo, size_t size);
> +void *xe_bo_map_fixed(int fd, uint32_t bo, size_t size, long unsigned int addr);
>  void *xe_bo_mmap_ext(int fd, uint32_t bo, size_t size, int prot);
>  int __xe_exec(int fd, struct drm_xe_exec *exec);
>  void xe_exec(int fd, struct drm_xe_exec *exec);
> diff --git a/tests/intel/xe_exec_system_allocator.c b/tests/intel/xe_exec_system_allocator.c
> new file mode 100644
> index 0000000000..4839090cb2
> --- /dev/null
> +++ b/tests/intel/xe_exec_system_allocator.c
> @@ -0,0 +1,1849 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +/**
> + * TEST: Basic tests for execbuf functionality using system allocator
> + * Category: Core
> + * Mega feature: USM
> + * Sub-category: System allocator
> + * Functionality: fault mode, system allocator
> + * GPU: LNL, BMG, PVC
> + */
> +
> +#include <fcntl.h>
> +#include <linux/mman.h>
> +#include <time.h>
> +
> +#include "igt.h"
> +#include "lib/igt_syncobj.h"
> +#include "lib/intel_reg.h"
> +#include "xe_drm.h"
> +
> +#include "xe/xe_ioctl.h"
> +#include "xe/xe_query.h"
> +#include <string.h>
> +
> +#define USER_FENCE_VALUE	0xdeadbeefdeadbeefull
> +#define QUARTER_SEC		(NSEC_PER_SEC / 4)
> +#define FIVE_SEC		(5LL * NSEC_PER_SEC)
> +
> +struct batch_data {
> +	uint32_t batch[16];
> +	uint64_t pad;
> +	uint32_t data;
> +	uint32_t expected_data;
> +};
> +
> +#define WRITE_VALUE(data__, i__)	({			\
> +	if (!(data__)->expected_data)				\
> +		(data__)->expected_data = rand() << 12 | (i__);	\
> +	(data__)->expected_data;				\
> +})
> +#define READ_VALUE(data__)	((data__)->expected_data)
> +
> +static void __write_dword(uint32_t *batch, uint64_t sdi_addr, uint32_t wdata,
> +			int *idx)
> +{
> +	batch[(*idx)++] = MI_STORE_DWORD_IMM_GEN4;
> +	batch[(*idx)++] = sdi_addr;
> +	batch[(*idx)++] = sdi_addr >> 32;
> +	batch[(*idx)++] = wdata;
> +}
> +
> +static void write_dword(uint32_t *batch, uint64_t sdi_addr, uint32_t wdata,
> +			int *idx)
> +{
> +	__write_dword(batch, sdi_addr, wdata, idx);
> +	batch[(*idx)++] = MI_BATCH_BUFFER_END;
> +}
> +
> +static void check_all_pages(void *ptr, uint64_t alloc_size, uint64_t stride,
> +			    pthread_barrier_t *barrier)
> +{
> +	int i, n_writes = alloc_size / stride;
> +
> +	for (i = 0; i < n_writes; ++i) {
> +		struct batch_data *data = ptr + i * stride;
> +
> +		igt_assert_eq(data->data, READ_VALUE(data));
> +
> +		if (barrier)
> +			pthread_barrier_wait(barrier);
> +	}
> +}
> +
> +static char sync_file[] = "/tmp/xe_exec_system_allocator_syncXXXXXX";
> +static int sync_fd;
> +
> +static void open_sync_file(void)
> +{
> +	sync_fd = mkstemp(sync_file);
> +}
> +
> +static void close_sync_file(void)
> +{
> +	close(sync_fd);
> +}
> +
> +struct process_data {
> +	pthread_mutex_t mutex;
> +	pthread_cond_t cond;
> +	pthread_barrier_t barrier;
> +	bool go;
> +};
> +
> +static void wait_pdata(struct process_data *pdata)
> +{
> +	pthread_mutex_lock(&pdata->mutex);
> +	while (!pdata->go)
> +		pthread_cond_wait(&pdata->cond, &pdata->mutex);
> +	pthread_mutex_unlock(&pdata->mutex);
> +}
> +
> +static void init_pdata(struct process_data *pdata, int n_engine)
> +{
> +	pthread_mutexattr_t mutex_attr;
> +	pthread_condattr_t cond_attr;
> +	pthread_barrierattr_t barrier_attr;
> +
> +	pthread_mutexattr_init(&mutex_attr);
> +	pthread_mutexattr_setpshared(&mutex_attr, PTHREAD_PROCESS_SHARED);
> +	pthread_mutex_init(&pdata->mutex, &mutex_attr);
> +
> +	pthread_condattr_init(&cond_attr);
> +	pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED);
> +	pthread_cond_init(&pdata->cond, &cond_attr);
> +
> +	pthread_barrierattr_init(&barrier_attr);
> +	pthread_barrierattr_setpshared(&barrier_attr, PTHREAD_PROCESS_SHARED);
> +	pthread_barrier_init(&pdata->barrier, &barrier_attr, n_engine);
> +
> +	pdata->go = false;
> +}
> +
> +static void signal_pdata(struct process_data *pdata)
> +{
> +	pthread_mutex_lock(&pdata->mutex);
> +	pdata->go = true;
> +	pthread_cond_broadcast(&pdata->cond);
> +	pthread_mutex_unlock(&pdata->mutex);
> +}
> +
> +/* many_alloc flags */
> +#define MIX_BO_ALLOC		(0x1 << 0)
> +#define BENCHMARK		(0x1 << 1)
> +#define CPU_FAULT_THREADS	(0x1 << 2)
> +#define CPU_FAULT_PROCESS	(0x1 << 3)
> +#define CPU_FAULT_SAME_PAGE	(0x1 << 4)
> +
> +static void process_check(void *ptr, uint64_t alloc_size, uint64_t stride,
> +			  unsigned int flags)
> +{
> +	struct process_data *pdata;
> +	int map_fd;
> +
> +	map_fd = open(sync_file, O_RDWR, 0x666);
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +	wait_pdata(pdata);
> +
> +	if (flags & CPU_FAULT_SAME_PAGE)
> +		check_all_pages(ptr, alloc_size, stride, &pdata->barrier);
> +	else
> +		check_all_pages(ptr, alloc_size, stride, NULL);
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +/*
> + * Partition checking of results into chunks, which causes multiple processes
> + * to fault the same VRAM allocation in parallel.
> + */
> +static void
> +check_all_pages_process(void *ptr, uint64_t alloc_size, uint64_t stride,
> +			int n_process, unsigned int flags)
> +{
> +	struct process_data *pdata;
> +	int map_fd, i;
> +
> +	map_fd = open(sync_file, O_RDWR | O_CREAT, 0x666);
> +	posix_fallocate(map_fd, 0, sizeof(*pdata));
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +
> +	init_pdata(pdata, n_process);
> +
> +	for (i = 0; i < n_process; ++i) {
> +		igt_fork(child, 1)
> +			if (flags & CPU_FAULT_SAME_PAGE)
> +				process_check(ptr, alloc_size, stride, flags);
> +			else
> +				process_check(ptr + stride * i, alloc_size,
> +					      stride * n_process, flags);
> +	}
> +
> +	signal_pdata(pdata);
> +	igt_waitchildren();
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +struct thread_check_data {
> +	pthread_t thread;
> +	pthread_mutex_t *mutex;
> +	pthread_cond_t *cond;
> +	pthread_barrier_t *barrier;
> +	void *ptr;
> +	uint64_t alloc_size;
> +	uint64_t stride;
> +	bool *go;
> +};
> +
> +static void *thread_check(void *data)
> +{
> +	struct thread_check_data *t = data;
> +
> +	pthread_mutex_lock(t->mutex);
> +	while (!*t->go)
> +		pthread_cond_wait(t->cond, t->mutex);
> +	pthread_mutex_unlock(t->mutex);
> +
> +	check_all_pages(t->ptr, t->alloc_size, t->stride, t->barrier);
> +
> +	return NULL;
> +}
> +
> +/*
> + * Partition checking of results into chunks, which causes multiple threads
> + * to fault the same VRAM allocation in parallel.
> + */
> +static void
> +check_all_pages_threads(void *ptr, uint64_t alloc_size, uint64_t stride,
> +			int n_threads, unsigned int flags)
> +{
> +	struct thread_check_data *threads_check_data;
> +	pthread_mutex_t mutex;
> +	pthread_cond_t cond;
> +	pthread_barrier_t barrier;
> +	int i;
> +	bool go = false;
> +
> +	threads_check_data = calloc(n_threads, sizeof(*threads_check_data));
> +	igt_assert(threads_check_data);
> +
> +	pthread_mutex_init(&mutex, 0);
> +	pthread_cond_init(&cond, 0);
> +	pthread_barrier_init(&barrier, 0, n_threads);
> +
> +	for (i = 0; i < n_threads; ++i) {
> +		threads_check_data[i].mutex = &mutex;
> +		threads_check_data[i].cond = &cond;
> +		if (flags & CPU_FAULT_SAME_PAGE) {
> +			threads_check_data[i].barrier = &barrier;
> +			threads_check_data[i].ptr = ptr;
> +			threads_check_data[i].alloc_size = alloc_size;
> +			threads_check_data[i].stride = stride;
> +		} else {
> +			threads_check_data[i].barrier = NULL;
> +			threads_check_data[i].ptr = ptr + stride * i;
> +			threads_check_data[i].alloc_size = alloc_size;
> +			threads_check_data[i].stride = n_threads * stride;
> +		}
> +		threads_check_data[i].go = &go;
> +
> +		pthread_create(&threads_check_data[i].thread, 0, thread_check,
> +			       &threads_check_data[i]);
> +	}
> +
> +	pthread_mutex_lock(&mutex);
> +	go = true;
> +	pthread_cond_broadcast(&cond);
> +	pthread_mutex_unlock(&mutex);
> +
> +	for (i = 0; i < n_threads; ++i)
> +		pthread_join(threads_check_data[i].thread, NULL);
> +	free(threads_check_data);
> +}
> +
> +static void touch_all_pages(int fd, uint32_t exec_queue, void *ptr,
> +			    uint64_t alloc_size, uint64_t stride,
> +			    struct timespec *tv, uint64_t *submit)
> +{
> +	struct drm_xe_sync sync[1] = {
> +		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE,
> +		  .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +		  .timeline_value = USER_FENCE_VALUE },
> +	};
> +	struct drm_xe_exec exec = {
> +		.num_batch_buffer = 1,
> +		.num_syncs = 0,
> +		.exec_queue_id = exec_queue,
> +		.syncs = to_user_pointer(&sync),
> +	};
> +	uint64_t addr = to_user_pointer(ptr);
> +	int i, ret, n_writes = alloc_size / stride;
> +	u64 *exec_ufence = NULL;
> +	int64_t timeout = FIVE_SEC;
> +
> +	exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
> +			   PROT_WRITE, MAP_SHARED |
> +			   MAP_ANONYMOUS, -1, 0);
> +	igt_assert(exec_ufence != MAP_FAILED);
> +	memset(exec_ufence, 0, SZ_4K);
> +	sync[0].addr = to_user_pointer(exec_ufence);
> +
> +	for (i = 0; i < n_writes; ++i, addr += stride) {
> +		struct batch_data *data = ptr + i * stride;
> +		uint64_t sdi_offset = (char *)&data->data - (char *)data;
> +		uint64_t sdi_addr = addr + sdi_offset;
> +		int b = 0;
> +
> +		write_dword(data->batch, sdi_addr, WRITE_VALUE(data, i), &b);
> +		igt_assert(b <= ARRAY_SIZE(data->batch));
> +	}
> +
> +	igt_nsec_elapsed(tv);
> +	*submit = igt_nsec_elapsed(tv);
> +
> +	addr = to_user_pointer(ptr);
> +	for (i = 0; i < n_writes; ++i, addr += stride) {
> +		struct batch_data *data = ptr + i * stride;
> +		uint64_t batch_offset = (char *)&data->batch - (char *)data;
> +		uint64_t batch_addr = addr + batch_offset;
> +
> +		exec.address = batch_addr;
> +		if (i + 1 == n_writes)
> +			exec.num_syncs = 1;
> +		xe_exec(fd, &exec);
> +	}
> +
> +	ret = __xe_wait_ufence(fd, exec_ufence, USER_FENCE_VALUE, exec_queue,
> +			       &timeout);
> +	if (ret) {
> +		igt_info("FAIL EXEC_UFENCE_ADDR: 0x%016llx\n", sync[0].addr);
> +		igt_info("FAIL EXEC_UFENCE: EXPECTED=0x%016llx, ACTUAL=0x%016lx\n",
> +			 USER_FENCE_VALUE, exec_ufence[0]);
> +
> +		addr = to_user_pointer(ptr);
> +		for (i = 0; i < n_writes; ++i, addr += stride) {
> +			struct batch_data *data = ptr + i * stride;
> +			uint64_t batch_offset = (char *)&data->batch - (char *)data;
> +			uint64_t batch_addr = addr + batch_offset;
> +			uint64_t sdi_offset = (char *)&data->data - (char *)data;
> +			uint64_t sdi_addr = addr + sdi_offset;
> +
> +			igt_info("FAIL BATCH_ADDR: 0x%016lx\n", batch_addr);
> +			igt_info("FAIL SDI_ADDR: 0x%016lx\n", sdi_addr);
> +			igt_info("FAIL SDI_ADDR (in batch): 0x%016lx\n",
> +				 (((u64)data->batch[2]) << 32) | data->batch[1]);
> +			igt_info("FAIL DATA: EXPECTED=0x%08x, ACTUAL=0x%08x\n",
> +				 data->expected_data, data->data);
> +		}
> +		igt_assert_eq(ret, 0);
> +	}
> +	munmap(exec_ufence, SZ_4K);
> +}
> +
> +static int va_bits;
> +
> +#define bind_system_allocator(__sync, __num_sync)			\
> +	__xe_vm_bind_assert(fd, vm, 0,					\
> +			    0, 0, 0, 0x1ull << va_bits,			\
> +			    DRM_XE_VM_BIND_OP_MAP,			\
> +			    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,	\
> +			    (__sync), (__num_sync), 0, 0)
> +
> +#define unbind_system_allocator()				\
> +	__xe_vm_bind(fd, vm, 0, 0, 0, 0, 0x1ull << va_bits,	\
> +		     DRM_XE_VM_BIND_OP_UNMAP, 0,		\
> +		     NULL, 0, 0, 0, 0)
> +
> +#define odd(__i)	(__i & 1)
> +
> +struct aligned_alloc_type {
> +	void *__ptr;
> +	void *ptr;
> +	size_t __size;
> +	size_t size;
> +};
> +
> +static struct aligned_alloc_type __aligned_alloc(size_t alignment, size_t size)
> +{
> +	struct aligned_alloc_type aligned_alloc_type;
> +
> +	aligned_alloc_type.__ptr = mmap(NULL, alignment + size, PROT_NONE, MAP_PRIVATE |
> +			      MAP_ANONYMOUS, -1, 0);
> +	igt_assert(aligned_alloc_type.__ptr != MAP_FAILED);
> +
> +	aligned_alloc_type.ptr = (void *)ALIGN((uint64_t)aligned_alloc_type.__ptr, alignment);
> +	aligned_alloc_type.size = size;
> +	aligned_alloc_type.__size = size + alignment;
> +
> +	return aligned_alloc_type;
> +}
> +
> +static void __aligned_free(struct aligned_alloc_type  *aligned_alloc_type)
> +{
> +	munmap(aligned_alloc_type->__ptr, aligned_alloc_type->__size);
> +}
> +
> +static void __aligned_partial_free(struct aligned_alloc_type  *aligned_alloc_type)
> +{
> +	size_t begin_size = (size_t)(aligned_alloc_type->ptr - aligned_alloc_type->__ptr);
> +
> +	if (begin_size)
> +		munmap(aligned_alloc_type->__ptr, begin_size);
> +	if (aligned_alloc_type->__size - aligned_alloc_type->size - begin_size)
> +		munmap(aligned_alloc_type->ptr + aligned_alloc_type->size,
> +		       aligned_alloc_type->__size - aligned_alloc_type->size - begin_size);
> +}
> +
> +/**
> + * SUBTEST: unaligned-alloc
> + * Description: allocate unaligned sizes of memory
> + * Test category: functionality test
> + *
> + * SUBTEST: fault-benchmark
> + * Description: Benchmark how long GPU / CPU take
> + * Test category: performance test
> + *
> + * SUBTEST: fault-threads-benchmark
> + * Description: Benchmark how long GPU / CPU take, reading results with multiple threads
> + * Test category: performance and functionality test
> + *
> + * SUBTEST: fault-threads-same-page-benchmark
> + * Description: Benchmark how long GPU / CPU take, reading results with multiple threads, hammer same page
> + * Test category: performance and functionality test
> + *
> + * SUBTEST: fault-process-benchmark
> + * Description: Benchmark how long GPU / CPU take, reading results with multiple processes
> + * Test category: performance and functionality test
> + *
> + * SUBTEST: fault-process-same-page-benchmark
> + * Description: Benchmark how long GPU / CPU take, reading results with multiple processes, hammer same page
> + * Test category: performance and functionality test
> + *
> + * SUBTEST: evict-malloc
> + * Description: trigger eviction of VRAM allocated via malloc
> + * Test category: functionality test
> + *
> + * SUBTEST: evict-malloc-mix-bo
> + * Description: trigger eviction of VRAM allocated via malloc and BO create
> + * Test category: functionality test
> + *
> + * SUBTEST: processes-evict-malloc
> + * Description: multi-process trigger eviction of VRAM allocated via malloc
> + * Test category: stress test
> + *
> + * SUBTEST: processes-evict-malloc-mix-bo
> + * Description: multi-process trigger eviction of VRAM allocated via malloc and BO create
> + * Test category: stress test
> + */
> +
> +static void
> +many_allocs(int fd, struct drm_xe_engine_class_instance *eci,
> +	    uint64_t total_alloc, uint64_t alloc_size, uint64_t stride,
> +	    pthread_barrier_t *barrier, unsigned int flags)
> +{
> +	uint32_t vm, exec_queue;
> +	int num_allocs = flags & BENCHMARK ? 1 :
> +		(9 * (total_alloc / alloc_size)) / 8;
> +	struct aligned_alloc_type *allocs;
> +	uint32_t *bos = NULL;
> +	struct timespec tv = {};
> +	uint64_t submit, read, elapsed;
> +	int i;
> +
> +	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> +			  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> +	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
> +
> +	bind_system_allocator(NULL, 0);
> +
> +	allocs = malloc(sizeof(*allocs) * num_allocs);
> +	igt_assert(allocs);
> +	memset(allocs, 0, sizeof(*allocs) * num_allocs);
> +
> +	if (flags & MIX_BO_ALLOC) {
> +		bos = malloc(sizeof(*bos) * num_allocs);
> +		igt_assert(bos);
> +		memset(bos, 0, sizeof(*bos) * num_allocs);
> +	}
> +
> +	for (i = 0; i < num_allocs; ++i) {
> +		struct aligned_alloc_type alloc;
> +
> +		if (flags & MIX_BO_ALLOC && odd(i)) {
> +			uint32_t bo_flags =
> +				DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
> +
> +			alloc = __aligned_alloc(SZ_2M, alloc_size);
> +			igt_assert(alloc.ptr);
> +
> +			bos[i] = xe_bo_create(fd, vm, alloc_size,
> +					      vram_if_possible(fd, eci->gt_id),
> +					      bo_flags);
> +			alloc.ptr = xe_bo_map_fixed(fd, bos[i], alloc_size,
> +						    to_user_pointer(alloc.ptr));
> +			xe_vm_bind_async(fd, vm, 0, bos[i], 0,
> +					 to_user_pointer(alloc.ptr),
> +					 alloc_size, 0, 0);
> +		} else {
> +			alloc.ptr = aligned_alloc(SZ_2M, alloc_size);
> +			igt_assert(alloc.ptr);
> +		}
> +		allocs[i] = alloc;
> +
> +		touch_all_pages(fd, exec_queue, allocs[i].ptr, alloc_size, stride,
> +				&tv, &submit);
> +	}
> +
> +	if (barrier)
> +		pthread_barrier_wait(barrier);
> +
> +	for (i = 0; i < num_allocs; ++i) {
> +		if (flags & BENCHMARK)
> +			read = igt_nsec_elapsed(&tv);
> +#define NUM_CHECK_THREADS	8
> +		if (flags & CPU_FAULT_PROCESS)
> +			check_all_pages_process(allocs[i].ptr, alloc_size, stride,
> +						NUM_CHECK_THREADS, flags);
> +		else if (flags & CPU_FAULT_THREADS)
> +			check_all_pages_threads(allocs[i].ptr, alloc_size, stride,
> +						NUM_CHECK_THREADS, flags);
> +		else
> +			check_all_pages(allocs[i].ptr, alloc_size, stride, NULL);
> +		if (flags & BENCHMARK) {
> +			elapsed = igt_nsec_elapsed(&tv);
> +			igt_info("Execution took %.3fms (submit %.1fus, read %.1fus, total %.1fus, read_total %.1fus)\n",
> +				 1e-6 * elapsed, 1e-3 * submit, 1e-3 * read,
> +				 1e-3 * (elapsed - submit),
> +				 1e-3 * (elapsed - read));
> +		}
> +		if (bos && bos[i]) {
> +			__aligned_free(allocs + i);
> +			gem_close(fd, bos[i]);
> +		} else {
> +			free(allocs[i].ptr);
> +		}
> +	}
> +	if (bos)
> +		free(bos);
> +	free(allocs);
> +	xe_exec_queue_destroy(fd, exec_queue);
> +	xe_vm_destroy(fd, vm);
> +}
> +
> +static void process_evict(struct drm_xe_engine_class_instance *hwe,
> +			  uint64_t total_alloc, uint64_t alloc_size,
> +			  uint64_t stride, unsigned int flags)
> +{
> +	struct process_data *pdata;
> +	int map_fd;
> +	int fd;
> +
> +	map_fd = open(sync_file, O_RDWR, 0x666);
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +	wait_pdata(pdata);
> +
> +	fd = drm_open_driver(DRIVER_XE);
> +	many_allocs(fd, hwe, total_alloc, alloc_size, stride, &pdata->barrier,
> +		    flags);
> +	drm_close_driver(fd);
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +static void
> +processes_evict(int fd, uint64_t alloc_size, uint64_t stride,
> +		unsigned int flags)
> +{
> +	struct drm_xe_engine_class_instance *hwe;
> +	struct process_data *pdata;
> +	int n_engine_gt[2] = { 0, 0 }, n_engine = 0;
> +	int map_fd;
> +
> +	map_fd = open(sync_file, O_RDWR | O_CREAT, 0x666);
> +	posix_fallocate(map_fd, 0, sizeof(*pdata));
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +
> +	xe_for_each_engine(fd, hwe) {
> +		igt_assert(hwe->gt_id < 2);
> +		n_engine_gt[hwe->gt_id]++;
> +		n_engine++;
> +	}
> +
> +	init_pdata(pdata, n_engine);
> +
> +	xe_for_each_engine(fd, hwe) {
> +		igt_fork(child, 1)
> +			process_evict(hwe,
> +				      xe_visible_vram_size(fd, hwe->gt_id) /
> +				      n_engine_gt[hwe->gt_id], alloc_size,
> +				      stride, flags);
> +	}
> +
> +	signal_pdata(pdata);
> +	igt_waitchildren();
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +#define CPU_FAULT	(0x1 << 0)
> +#define REMAP		(0x1 << 1)
> +#define MIDDLE		(0x1 << 2)
> +
> +/**
> + * SUBTEST: partial-munmap-cpu-fault
> + * Description: munmap partially with cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-munmap-no-cpu-fault
> + * Description: munmap partially with no cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-remap-cpu-fault
> + * Description: remap partially with cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-remap-no-cpu-fault
> + * Description: remap partially with no cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-middle-munmap-cpu-fault
> + * Description: munmap middle with cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-middle-munmap-no-cpu-fault
> + * Description: munmap middle with no cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-middle-remap-cpu-fault
> + * Description: remap middle with cpu access in between
> + * Test category: functionality test
> + *
> + * SUBTEST: partial-middle-remap-no-cpu-fault
> + * Description: remap middle with no cpu access in between
> + * Test category: functionality test
> + */
> +
> +static void
> +partial(int fd, struct drm_xe_engine_class_instance *eci, unsigned int flags)
> +{
> +	struct drm_xe_sync sync[1] = {
> +		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +	          .timeline_value = USER_FENCE_VALUE },
> +	};
> +	struct drm_xe_exec exec = {
> +		.num_batch_buffer = 1,
> +		.num_syncs = 1,
> +		.syncs = to_user_pointer(sync),
> +	};
> +	struct {
> +		uint32_t batch[16];
> +		uint64_t pad;
> +		uint64_t vm_sync;
> +		uint64_t exec_sync;
> +		uint32_t data;
> +		uint32_t expected_data;
> +	} *data;
> +	size_t bo_size = SZ_2M, unmap_offset = 0;
> +	uint32_t vm, exec_queue;
> +	u64 *exec_ufence = NULL;
> +	int i;
> +	void *old, *new = NULL;
> +	struct aligned_alloc_type alloc;
> +
> +	if (flags & MIDDLE)
> +		unmap_offset = bo_size / 4;
> +
> +	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> +			  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> +
> +	alloc = __aligned_alloc(bo_size, bo_size);
> +	igt_assert(alloc.ptr);
> +
> +	data = mmap(alloc.ptr, bo_size, PROT_READ | PROT_WRITE,
> +		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
> +	igt_assert(data != MAP_FAILED);
> +	memset(data, 0, bo_size);
> +	old = data;
> +
> +	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
> +
> +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +	bind_system_allocator(sync, 1);
> +	xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0, FIVE_SEC);
> +	data[0].vm_sync = 0;
> +
> +	exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
> +			   PROT_WRITE, MAP_SHARED |
> +			   MAP_ANONYMOUS, -1, 0);
> +	igt_assert(exec_ufence != MAP_FAILED);
> +	memset(exec_ufence, 0, SZ_4K);
> +
> +	for (i = 0; i < 2; i++) {
> +		uint64_t addr = to_user_pointer(data);
> +		uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
> +		uint64_t sdi_addr = addr + sdi_offset;
> +		int b = 0;
> +
> +		write_dword(data[i].batch, sdi_addr, WRITE_VALUE(&data[i], i), &b);
> +		igt_assert(b <= ARRAY_SIZE(data[i].batch));
> +
> +		if (!i)
> +			data = old + unmap_offset + bo_size / 2;
> +	}
> +
> +	data = old;
> +	exec.exec_queue_id = exec_queue;
> +
> +	for (i = 0; i < 2; i++) {
> +		uint64_t addr = to_user_pointer(data);
> +		uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
> +		uint64_t batch_addr = addr + batch_offset;
> +
> +		sync[0].addr = new ? to_user_pointer(new) :
> +			to_user_pointer(exec_ufence);
> +		exec.address = batch_addr;
> +		xe_exec(fd, &exec);
> +
> +		xe_wait_ufence(fd, new ?: exec_ufence, USER_FENCE_VALUE,
> +			       exec_queue, FIVE_SEC);
> +		if (i || (flags & CPU_FAULT))
> +			igt_assert_eq(data[i].data, READ_VALUE(&data[i]));
> +		exec_ufence[0] = 0;
> +
> +		if (!i) {
> +			data = old + unmap_offset + bo_size / 2;
> +			munmap(old + unmap_offset, bo_size / 2);
> +			if (flags & REMAP) {
> +				new = mmap(old + unmap_offset, bo_size / 2,
> +					   PROT_READ | PROT_WRITE,
> +					   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
> +					   MAP_LOCKED, -1, 0);
> +				igt_assert(new != MAP_FAILED);
> +			}
> +		}
> +	}
> +
> +	xe_exec_queue_destroy(fd, exec_queue);
> +	munmap(exec_ufence, SZ_4K);
> +	__aligned_free(&alloc);
> +	if (new)
> +		munmap(new, bo_size / 2);
> +	xe_vm_destroy(fd, vm);
> +}
> +
> +#define MAX_N_EXEC_QUEUES	16
> +
> +#define MMAP		(0x1 << 0)
> +#define NEW		(0x1 << 1)
> +#define BO_UNMAP	(0x1 << 2)
> +#define FREE		(0x1 << 3)
> +#define BUSY		(0x1 << 4)
> +#define BO_MAP		(0x1 << 5)
> +#define RACE		(0x1 << 6)
> +#define SKIP_MEMSET	(0x1 << 7)
> +#define FAULT		(0x1 << 8)
> +#define FILE_BACKED	(0x1 << 9)
> +#define LOCK		(0x1 << 10)
> +#define MMAP_SHARED	(0x1 << 11)
> +#define HUGE_PAGE	(0x1 << 12)
> +#define SHARED_ALLOC	(0x1 << 13)
> +#define FORK_READ	(0x1 << 14)
> +#define FORK_READ_AFTER	(0x1 << 15)
> +#define MREMAP		(0x1 << 16)
> +#define DONTUNMAP	(0x1 << 17)
> +#define READ_ONLY_REMAP	(0x1 << 18)
> +#define SYNC_EXEC	(0x1 << 19)
> +#define EVERY_OTHER_CHECK	(0x1 << 20)
> +#define MULTI_FAULT	(0x1 << 21)
> +
> +#define N_MULTI_FAULT	4
> +
> +/**
> + * SUBTEST: once-%s
> + * Description: Run %arg[1] system allocator test only once
> + * Test category: functionality test
> + *
> + * SUBTEST: once-large-%s
> + * Description: Run %arg[1] system allocator test only once with large allocation
> + * Test category: functionality test
> + *
> + * SUBTEST: twice-%s
> + * Description: Run %arg[1] system allocator test twice
> + * Test category: functionality test
> + *
> + * SUBTEST: twice-large-%s
> + * Description: Run %arg[1] system allocator test twice with large allocation
> + * Test category: functionality test
> + *
> + * SUBTEST: many-%s
> + * Description: Run %arg[1] system allocator test many times
> + * Test category: stress test
> + *
> + * SUBTEST: many-stride-%s
> + * Description: Run %arg[1] system allocator test many times with a stride on each exec
> + * Test category: stress test
> + *
> + * SUBTEST: many-execqueues-%s
> + * Description: Run %arg[1] system allocator test on many exec_queues
> + * Test category: stress test
> + *
> + * SUBTEST: many-large-%s
> + * Description: Run %arg[1] system allocator test many times with large allocations
> + * Test category: stress test
> + *
> + * SUBTEST: many-large-execqueues-%s
> + * Description: Run %arg[1] system allocator test on many exec_queues with large allocations
> + *
> + * SUBTEST: threads-many-%s
> + * Description: Run %arg[1] system allocator threaded test many times
> + * Test category: stress test
> + *
> + * SUBTEST: threads-many-stride-%s
> + * Description: Run %arg[1] system allocator threaded test many times with a stride on each exec
> + * Test category: stress test
> + *
> + * SUBTEST: threads-many-execqueues-%s
> + * Description: Run %arg[1] system allocator threaded test on many exec_queues
> + * Test category: stress test
> + *
> + * SUBTEST: threads-many-large-%s
> + * Description: Run %arg[1] system allocator threaded test many times with large allocations
> + * Test category: stress test
> + *
> + * SUBTEST: threads-many-large-execqueues-%s
> + * Description: Run %arg[1] system allocator threaded test on many exec_queues with large allocations
> + *
> + * SUBTEST: threads-shared-vm-many-%s
> + * Description: Run %arg[1] system allocator threaded, shared vm test many times
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-vm-many-stride-%s
> + * Description: Run %arg[1] system allocator threaded, shared vm test many times with a stride on each exec
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-vm-many-execqueues-%s
> + * Description: Run %arg[1] system allocator threaded, shared vm test on many exec_queues
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-vm-many-large-%s
> + * Description: Run %arg[1] system allocator threaded, shared vm test many times with large allocations
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-vm-many-large-execqueues-%s
> + * Description: Run %arg[1] system allocator threaded, shared vm test on many exec_queues with large allocations
> + * Test category: stress test
> + *
> + * SUBTEST: process-many-%s
> + * Description: Run %arg[1] system allocator multi-process test many times
> + * Test category: stress test
> + *
> + * SUBTEST: process-many-stride-%s
> + * Description: Run %arg[1] system allocator multi-process test many times with a stride on each exec
> + * Test category: stress test
> + *
> + * SUBTEST: process-many-execqueues-%s
> + * Description: Run %arg[1] system allocator multi-process test on many exec_queues
> + * Test category: stress test
> + *
> + * SUBTEST: process-many-large-%s
> + * Description: Run %arg[1] system allocator multi-process test many times with large allocations
> + * Test category: stress test
> + *
> + * SUBTEST: process-many-large-execqueues-%s
> + * Description: Run %arg[1] system allocator multi-process test on many exec_queues with large allocations
> + *
> + * SUBTEST: fault
> + * Description: use a bad system allocator address resulting in a fault
> + * Test category: bad input
> + *
> + * arg[1]:
> + *
> + * @malloc:				malloc single buffer for all execs
> + * @malloc-multi-fault:			malloc single buffer for all execs, issue a command which will trigger multiple faults
> + * @malloc-fork-read:			malloc single buffer for all execs, fork a process to read test output
> + * @malloc-fork-read-after:		malloc single buffer for all execs, fork a process to read test output, check again after fork returns in parent
> + * @malloc-mlock:			malloc and mlock single buffer for all execs
> + * @malloc-race:			malloc single buffer for all execs with race between cpu and gpu access
> + * @malloc-bo-unmap:			malloc single buffer for all execs, bind and unbind a BO to same address before execs
> + * @malloc-busy:			malloc single buffer for all execs, try to unbind while buffer valid
> + * @mmap:				mmap single buffer for all execs
> + * @mmap-remap:				mmap and mremap a buffer for all execs
> + * @mmap-remap-dontunmap:		mmap and mremap a buffer with dontunmap flag for all execs
> + * @mmap-remap-ro:			mmap and mremap a read-only buffer for all execs
> + * @mmap-remap-ro-dontunmap:		mmap and mremap a read-only buffer with dontunmap flag for all execs
> + * @mmap-remap-eocheck:			mmap and mremap a buffer for all execs, check data every other loop iteration
> + * @mmap-remap-dontunmap-eocheck:	mmap and mremap a buffer with dontunmap flag for all execs, check data every other loop iteration
> + * @mmap-remap-ro-eocheck:		mmap and mremap a read-only buffer for all execs, check data every other loop iteration
> + * @mmap-remap-ro-dontunmap-eocheck:	mmap and mremap a read-only buffer with dontunmap flag for all execs, check data every other loop iteration
> + * @mmap-huge:				mmap huge page single buffer for all execs
> + * @mmap-shared:			mmap shared single buffer for all execs
> + * @mmap-shared-remap:			mmap shared and mremap a buffer for all execs
> + * @mmap-shared-remap-dontunmap:	mmap shared and mremap a buffer with dontunmap flag for all execs
> + * @mmap-shared-remap-eocheck:		mmap shared and mremap a buffer for all execs, check data every other loop iteration
> + * @mmap-shared-remap-dontunmap-eocheck:	mmap shared and mremap a buffer with dontunmap flag for all execs, check data every other loop iteration
> + * @mmap-mlock:				mmap and mlock single buffer for all execs
> + * @mmap-file:				mmap single buffer, with file backing, for all execs
> + * @mmap-file-mlock:			mmap and mlock single buffer, with file backing, for all execs
> + * @mmap-race:				mmap single buffer for all execs with race between cpu and gpu access
> + * @free:				malloc and free buffer for each exec
> + * @free-race:				malloc and free buffer for each exec with race between cpu and gpu access
> + * @new:				malloc a new buffer for each exec
> + * @new-race:				malloc a new buffer for each exec with race between cpu and gpu access
> + * @new-bo-map:				malloc a new buffer or map BO for each exec
> + * @new-busy:				malloc a new buffer for each exec, try to unbind while buffers valid
> + * @mmap-free:				mmap and free buffer for each exec
> + * @mmap-free-huge:			mmap huge page and free buffer for each exec
> + * @mmap-free-race:			mmap and free buffer for each exec with race between cpu and gpu access
> + * @mmap-new:				mmap a new buffer for each exec
> + * @mmap-new-huge:			mmap huge page a new buffer for each exec
> + * @mmap-new-race:			mmap a new buffer for each exec with race between cpu and gpu access
> + * @malloc-nomemset:			malloc single buffer for all execs, skip memset of buffers
> + * @malloc-mlock-nomemset:		malloc and mlock single buffer for all execs, skip memset of buffers
> + * @malloc-race-nomemset:		malloc single buffer for all execs with race between cpu and gpu access, skip memset of buffers
> + * @malloc-bo-unmap-nomemset:		malloc single buffer for all execs, bind and unbind a BO to same address before execs, skip memset of buffers
> + * @malloc-busy-nomemset:		malloc single buffer for all execs, try to unbind while buffer valid, skip memset of buffers
> + * @mmap-nomemset:			mmap single buffer for all execs, skip memset of buffers
> + * @mmap-huge-nomemset:			mmap huge page single buffer for all execs, skip memset of buffers
> + * @mmap-shared-nomemset:		mmap shared single buffer for all execs, skip memset of buffers
> + * @mmap-mlock-nomemset:		mmap and mlock single buffer for all execs, skip memset of buffers
> + * @mmap-file-nomemset:			mmap single buffer, with file backing, for all execs, skip memset of buffers
> + * @mmap-file-mlock-nomemset:		mmap and mlock single buffer, with file backing, for all execs, skip memset of buffers
> + * @mmap-race-nomemset:			mmap single buffer for all execs with race between cpu and gpu access, skip memset of buffers
> + * @free-nomemset:			malloc and free buffer for each exec, skip memset of buffers
> + * @free-race-nomemset:			malloc and free buffer for each exec with race between cpu and gpu access, skip memset of buffers
> + * @new-nomemset:			malloc a new buffer for each exec, skip memset of buffers
> + * @new-race-nomemset:			malloc a new buffer for each exec with race between cpu and gpu access, skip memset of buffers
> + * @new-bo-map-nomemset:		malloc a new buffer or map BO for each exec, skip memset of buffers
> + * @new-busy-nomemset:			malloc a new buffer for each exec, try to unbind while buffers valid, skip memset of buffers
> + * @mmap-free-nomemset:			mmap and free buffer for each exec, skip memset of buffers
> + * @mmap-free-huge-nomemset:		mmap huge page and free buffer for each exec, skip memset of buffers
> + * @mmap-free-race-nomemset:		mmap and free buffer for each exec with race between cpu and gpu access, skip memset of buffers
> + * @mmap-new-nomemset:			mmap a new buffer for each exec, skip memset of buffers
> + * @mmap-new-huge-nomemset:		mmap huge page new buffer for each exec, skip memset of buffers
> + * @mmap-new-race-nomemset:		mmap a new buffer for each exec with race between cpu and gpu access, skip memset of buffers
> + *
> + * SUBTEST: threads-shared-vm-shared-alloc-many-stride-malloc
> + * Description: Create multiple threads with a shared VM triggering faults on different hardware engines to same addresses
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-vm-shared-alloc-many-stride-malloc-race
> + * Description: Create multiple threads with a shared VM triggering faults on different hardware engines to same addresses, racing between CPU and GPU access
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-alloc-many-stride-malloc
> + * Description: Create multiple threads triggering faults on different hardware engines to same addresses
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-alloc-many-stride-malloc-sync
> + * Description: Create multiple threads triggering faults on different hardware engines to same addresses, syncing on each exec
> + * Test category: stress test
> + *
> + * SUBTEST: threads-shared-alloc-many-stride-malloc-race
> + * Description: Create multiple threads triggering faults on different hardware engines to same addresses, racing between CPU and GPU access
> + * Test category: stress test
> + */
> +/*
> + * Per-exec slot inside the test buffer. test_exec() indexes an array of
> + * these (data[idx]) and hands the same addresses to the GPU, so the
> + * layout is shared between CPU and GPU accesses.
> + */
> +struct test_exec_data {
> +	uint32_t batch[32];	/* batch buffer filled by write_dword()/__write_dword() */
> +	uint64_t pad;
> +	uint64_t vm_sync;	/* user fence waited on after VM bind operations */
> +	uint64_t exec_sync;	/* user fence for the exec when no separate fence page is used (RACE) */
> +	uint32_t data;		/* dword the batch targets; compared against READ_VALUE() */
> +	uint32_t expected_data;	/* presumably consumed by WRITE_VALUE()/READ_VALUE() - confirm in macro definitions */
> +};
> +/*
> + * test_exec() - submit n_execs batches against system-allocator memory.
> + *
> + * Each iteration builds a batch in data[idx].batch with write_dword(),
> + * submits it on exec_queues[i % n_exec_queues] using the CPU pointer value
> + * as the GPU address, waits on a user fence and compares data[idx].data
> + * with READ_VALUE(). @flags select how the backing memory is obtained and
> + * perturbed between execs (malloc vs mmap, new buffer per exec, mremap,
> + * BO bind on top of the range, mlock, fork, fault injection, ...).
> + *
> + * @fd:            open Xe DRM fd
> + * @eci:           engine to create the exec queues on
> + * @n_exec_queues: number of exec queues, at most MAX_N_EXEC_QUEUES
> + * @n_execs:       number of submissions
> + * @bo_size:       buffer size; 0 derives it from n_execs (and stride)
> + * @stride:        slot index multiplier per exec; 0 means consecutive slots
> + * @vm:            VM to use; 0 creates (and later destroys) a private one
> + * @alloc:         caller-owned buffer to use instead of allocating; not freed here
> + * @barrier:       optional barrier waited on before and after every exec
> + * @flags:         test behaviour flags
> + */
> +static void
> +test_exec(int fd, struct drm_xe_engine_class_instance *eci,
> +	  int n_exec_queues, int n_execs, size_t bo_size,
> +	  size_t stride, uint32_t vm, void *alloc, pthread_barrier_t *barrier,
> +	  unsigned int flags)
> +{
> +	uint64_t addr;
> +	struct drm_xe_sync sync[1] = {
> +		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +	          .timeline_value = USER_FENCE_VALUE },
> +	};
> +	struct drm_xe_exec exec = {
> +		.num_batch_buffer = 1,
> +		.num_syncs = 1,
> +		.syncs = to_user_pointer(sync),
> +	};
> +	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> +	struct test_exec_data *data, *next_data = NULL;
> +	uint32_t bo_flags;
> +	uint32_t bo = 0;
> +	void **pending_free;
> +	u64 *exec_ufence = NULL;
> +	int i, j, b, file_fd = -1, prev_idx;
> +	bool free_vm = false;
> +	size_t aligned_size = bo_size ?: xe_get_default_alignment(fd);
> +	size_t orig_size = bo_size;
> +	struct aligned_alloc_type aligned_alloc_type;
> +	/* MULTI_FAULT needs an explicit bo_size: the buffer becomes N_MULTI_FAULT copies of it */
> +	if (flags & MULTI_FAULT) {
> +		if (!bo_size)
> +			return;
> +
> +		bo_size *= N_MULTI_FAULT;
> +	}
> +	/* SHARED_ALLOC is not handled here; threads() clears it before calling in */
> +	if (flags & SHARED_ALLOC)
> +		return;
> +	/* EVERY_OTHER_CHECK works on pairs of execs, so an odd n_execs is silently skipped */
> +	if (flags & EVERY_OTHER_CHECK && odd(n_execs))
> +		return;
> +
> +	if (flags & EVERY_OTHER_CHECK)
> +		igt_assert(flags & MREMAP);
> +
> +	igt_assert(n_exec_queues <= MAX_N_EXEC_QUEUES);
> +	/* NEW without FREE: replaced buffers are parked in pending_free[] and released at the end */
> +	if (flags & NEW && !(flags & FREE)) {
> +		pending_free = malloc(sizeof(*pending_free) * n_execs);
> +		igt_assert(pending_free);
> +		memset(pending_free, 0, sizeof(*pending_free) * n_execs);
> +	}
> +	/* No VM supplied: create a private LR + fault-mode VM, destroyed on exit */
> +	if (!vm) {
> +		vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> +				  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> +		free_vm = true;
> +	}
> +	if (!bo_size) {
> +		if (!stride) {
> +			bo_size = sizeof(*data) * n_execs;
> +			bo_size = xe_bb_size(fd, bo_size);
> +		} else {
> +			bo_size = stride * n_execs * sizeof(*data);
> +			bo_size = xe_bb_size(fd, bo_size);
> +		}
> +	}
> +	if (flags & HUGE_PAGE) {
> +		aligned_size = ALIGN(aligned_size, SZ_2M);
> +		bo_size = ALIGN(bo_size, SZ_2M);
> +	}
> +	/* Pick the backing memory: caller buffer, fixed-address mmap, or aligned_alloc */
> +	if (alloc) {
> +		data = alloc;
> +	} else {
> +		if (flags & MMAP) {
> +			int mmap_flags = MAP_FIXED;
> +			/* presumably reserves an aligned VA range for the MAP_FIXED mmap below - confirm in __aligned_alloc() */
> +			aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
> +			data = aligned_alloc_type.ptr;
> +			igt_assert(data);
> +			__aligned_partial_free(&aligned_alloc_type);
> +
> +			if (flags & MMAP_SHARED)
> +				mmap_flags |= MAP_SHARED;
> +			else
> +				mmap_flags |= MAP_PRIVATE;
> +
> +			if (flags & HUGE_PAGE)
> +				mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB;
> +
> +			if (flags & FILE_BACKED) {
> +				char name[] = "/tmp/xe_exec_system_allocator_datXXXXXX";
> +				/* NOTE(review): mkstemp()/posix_fallocate() results are unchecked and the temp file is never unlinked */
> +				igt_assert(!(flags & NEW));
> +
> +				file_fd = mkstemp(name);
> +				posix_fallocate(file_fd, 0, bo_size);
> +			} else {
> +				mmap_flags |= MAP_ANONYMOUS;
> +			}
> +
> +			data = mmap(data, bo_size, PROT_READ |
> +				    PROT_WRITE, mmap_flags, file_fd, 0);
> +			igt_assert(data != MAP_FAILED);
> +		} else {
> +			data = aligned_alloc(aligned_size, bo_size);
> +			igt_assert(data);
> +		}
> +		if (!(flags & SKIP_MEMSET))
> +			memset(data, 0, bo_size);
> +		if (flags & LOCK) {
> +			igt_assert(!(flags & NEW));
> +			mlock(data, bo_size);
> +		}
> +	}
> +
> +	for (i = 0; i < n_exec_queues; i++)
> +		exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
> +	/* Only a VM created here is bound; threads() binds a shared VM itself before starting threads */
> +	sync[0].addr = to_user_pointer(&data[0].vm_sync);
> +	if (free_vm) {
> +		bind_system_allocator(sync, 1);
> +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0, FIVE_SEC);
> +	}
> +	data[0].vm_sync = 0;
> +
> +	addr = to_user_pointer(data);
> +	/* BO_UNMAP: bind a BO at addr, replace it with a CPU-address-mirror mapping, then drop the BO */
> +	if (flags & BO_UNMAP) {
> +		bo_flags = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
> +		bo = xe_bo_create(fd, vm, bo_size,
> +				  vram_if_possible(fd, eci->gt_id), bo_flags);
> +		xe_vm_bind_async(fd, vm, 0, bo, 0, addr, bo_size, 0, 0);
> +
> +		__xe_vm_bind_assert(fd, vm, 0,
> +				    0, 0, addr, bo_size,
> +				    DRM_XE_VM_BIND_OP_MAP,
> +				    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR, sync,
> +				    1, 0, 0);
> +		xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE, 0,
> +			       FIVE_SEC);
> +		data[0].vm_sync = 0;
> +		gem_close(fd, bo);
> +		bo = 0;
> +	}
> +	/* Without RACE the exec fence lives in its own page; with RACE it is data[idx].exec_sync inside the test buffer */
> +	if (!(flags & RACE)) {
> +		exec_ufence = mmap(NULL, SZ_4K, PROT_READ |
> +				   PROT_WRITE, MAP_SHARED |
> +				   MAP_ANONYMOUS, -1, 0);
> +		igt_assert(exec_ufence != MAP_FAILED);
> +		memset(exec_ufence, 0, SZ_4K);
> +	}
> +
> +	for (i = 0; i < n_execs; i++) {
> +		int idx = !stride ? i : i * stride, next_idx = !stride
> +			? (i + 1) : (i + 1) * stride;
> +		uint64_t batch_offset = (char *)&data[idx].batch - (char *)data;
> +		uint64_t batch_addr = addr + batch_offset;
> +		uint64_t sdi_offset = (char *)&data[idx].data - (char *)data;
> +		uint64_t sdi_addr = addr + sdi_offset;
> +		int e = i % n_exec_queues, err;
> +		bool fault_inject = (FAULT & flags) && i == n_execs / 2;
> +		bool fault_injected = (FAULT & flags) && i > n_execs;
> +		/* NOTE(review): i < n_execs always holds in this loop, so fault_injected is never true - was 'i > n_execs / 2' intended? */
> +		if (barrier)
> +			pthread_barrier_wait(barrier);
> +		/* Build the batch for this exec; MULTI_FAULT targets one dword in each orig_size-sized chunk */
> +		if (flags & MULTI_FAULT) {
> +			b = 0;
> +			for (j = 0; j < N_MULTI_FAULT - 1; ++j)
> +				__write_dword(data[idx].batch,
> +					      sdi_addr + j * orig_size,
> +					      WRITE_VALUE(&data[idx], idx), &b);
> +			write_dword(data[idx].batch, sdi_addr + j * orig_size,
> +				    WRITE_VALUE(&data[idx], idx), &b);
> +			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> +		} else if (!(flags & EVERY_OTHER_CHECK)) {
> +			b = 0;
> +			write_dword(data[idx].batch, sdi_addr,
> +				    WRITE_VALUE(&data[idx], idx), &b);
> +			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> +		} else if (flags & EVERY_OTHER_CHECK && !odd(i)) {
> +			b = 0;
> +			write_dword(data[idx].batch, sdi_addr,
> +				    WRITE_VALUE(&data[idx], idx), &b);
> +			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> +			/* Also pre-build the next (odd) batch, aimed at next_data, which the mremap below uses as its destination */
> +			aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
> +			next_data = aligned_alloc_type.ptr;
> +			igt_assert(next_data);
> +			__aligned_partial_free(&aligned_alloc_type);
> +
> +			b = 0;
> +			write_dword(data[next_idx].batch,
> +				    to_user_pointer(next_data) +
> +				    (char *)&data[next_idx].data - (char *)data,
> +				    WRITE_VALUE(&data[next_idx], next_idx), &b);
> +			igt_assert(b <= ARRAY_SIZE(data[next_idx].batch));
> +		}
> +
> +		if (!exec_ufence)
> +			data[idx].exec_sync = 0;
> +
> +		sync[0].addr = exec_ufence ? to_user_pointer(exec_ufence) :
> +			addr + (char *)&data[idx].exec_sync - (char *)data;
> +
> +		exec.exec_queue_id = exec_queues[e];
> +		if (fault_inject)
> +			exec.address = batch_addr * 2;	/* deliberately bad batch address */
> +		else
> +			exec.address = batch_addr;
> +
> +		if (fault_injected) {
> +			err = __xe_exec(fd, &exec);
> +			igt_assert(err == -ENOENT);
> +		} else {
> +			xe_exec(fd, &exec);
> +		}
> +
> +		if (barrier)
> +			pthread_barrier_wait(barrier);
> +
> +		if (fault_inject || fault_injected) {
> +			int64_t timeout = QUARTER_SEC;
> +			/* A faulting exec must not signal its fence: expect a timeout or an I/O error */
> +			err = __xe_wait_ufence(fd, exec_ufence ? exec_ufence :
> +					       &data[idx].exec_sync,
> +					       USER_FENCE_VALUE,
> +					       exec_queues[e], &timeout);
> +			igt_assert(err == -ETIME || err == -EIO);
> +		} else {
> +			xe_wait_ufence(fd, exec_ufence ? exec_ufence :
> +				       &data[idx].exec_sync, USER_FENCE_VALUE,
> +				       exec_queues[e], FIVE_SEC);
> +			if (flags & LOCK && !i)
> +				munlock(data, bo_size);
> +			/* MREMAP: move the buffer to a fresh aligned address after every completed exec */
> +			if (flags & MREMAP) {
> +				void *old = data;
> +				int remap_flags = MREMAP_MAYMOVE | MREMAP_FIXED;
> +
> +				if (flags & DONTUNMAP)
> +					remap_flags |= MREMAP_DONTUNMAP;
> +
> +				if (flags & READ_ONLY_REMAP)
> +					igt_assert(!mprotect(old, bo_size,
> +							     PROT_READ));
> +
> +				if (!next_data) {
> +					aligned_alloc_type = __aligned_alloc(aligned_size,
> +								    bo_size);
> +					data = aligned_alloc_type.ptr;
> +					__aligned_partial_free(&aligned_alloc_type);
> +				} else {
> +					data = next_data;
> +				}
> +				next_data = NULL;
> +				igt_assert(data);
> +
> +				data = mremap(old, bo_size, bo_size,
> +					      remap_flags, data);
> +				igt_assert(data != MAP_FAILED);
> +
> +				if (flags & READ_ONLY_REMAP)
> +					igt_assert(!mprotect(data, bo_size,
> +							     PROT_READ |
> +							     PROT_WRITE));
> +
> +				addr = to_user_pointer(data);
> +				if (flags & DONTUNMAP)
> +					munmap(old, bo_size);
> +			}
> +			/* Verify the result; with EVERY_OTHER_CHECK only on odd iterations, covering this and the previous slot */
> +			if (!(flags & EVERY_OTHER_CHECK) || odd(i)) {
> +				if (flags & FORK_READ) {
> +					igt_fork(child, 1)
> +						igt_assert_eq(data[idx].data,
> +							      READ_VALUE(&data[idx]));
> +					if (!(flags & FORK_READ_AFTER))
> +						igt_assert_eq(data[idx].data,
> +							      READ_VALUE(&data[idx]));
> +					igt_waitchildren();
> +					if (flags & FORK_READ_AFTER)
> +						igt_assert_eq(data[idx].data,
> +							      READ_VALUE(&data[idx]));
> +				} else {
> +					igt_assert_eq(data[idx].data,
> +						      READ_VALUE(&data[idx]));
> +
> +					if (flags & MULTI_FAULT) {
> +						for (j = 1; j < N_MULTI_FAULT; ++j) {
> +							struct test_exec_data *__data =
> +								((void *)data) + j * orig_size;
> +
> +							igt_assert_eq(__data[idx].data,
> +								      READ_VALUE(&data[idx]));
> +						}
> +					}
> +				}
> +				if (flags & EVERY_OTHER_CHECK)
> +					igt_assert_eq(data[prev_idx].data,
> +						      READ_VALUE(&data[prev_idx]));
> +			}
> +		}
> +
> +		if (exec_ufence)
> +			exec_ufence[0] = 0;
> +		/* A BO was mapped at data by the previous iteration: restore the CPU-address mirror, then drop mapping and BO */
> +		if (bo) {
> +			__xe_vm_bind_assert(fd, vm, 0,
> +					    0, 0, addr, bo_size,
> +					    DRM_XE_VM_BIND_OP_MAP,
> +					    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,
> +					    NULL, 0, 0, 0);
> +			munmap(data, bo_size);
> +			gem_close(fd, bo);
> +		}
> +		/* NEW: switch to a fresh buffer for the next exec, freeing or parking the current one */
> +		if (flags & NEW) {
> +			if (flags & MMAP) {
> +				if (flags & FREE)
> +					munmap(data, bo_size);
> +				else
> +					pending_free[i] = data;
> +				data = mmap(NULL, bo_size, PROT_READ |
> +					    PROT_WRITE, MAP_SHARED |
> +					    MAP_ANONYMOUS, -1, 0);
> +				igt_assert(data != MAP_FAILED);
> +			} else if (flags & BO_MAP && odd(i)) {
> +				if (!bo) {
> +					if (flags & FREE)
> +						free(data);
> +					else
> +						pending_free[i] = data;
> +				}
> +
> +				aligned_alloc_type = __aligned_alloc(aligned_size, bo_size);
> +				data = aligned_alloc_type.ptr;
> +				igt_assert(data);
> +				__aligned_partial_free(&aligned_alloc_type);
> +
> +				bo_flags = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
> +				bo = xe_bo_create(fd, vm, bo_size,
> +						  vram_if_possible(fd, eci->gt_id),
> +						  bo_flags);
> +				data = xe_bo_map_fixed(fd, bo, bo_size,
> +						       to_user_pointer(data));
> +
> +				xe_vm_bind_async(fd, vm, 0, bo, 0,
> +						 to_user_pointer(data),
> +						 bo_size, 0, 0);
> +			} else {
> +				if (!bo) {
> +					if (flags & FREE)
> +						free(data);
> +					else
> +						pending_free[i] = data;
> +				}
> +				bo = 0;
> +				data = aligned_alloc(aligned_size, bo_size);
> +				igt_assert(data);
> +			}
> +			addr = to_user_pointer(data);
> +			if (!(flags & SKIP_MEMSET))
> +				memset(data, 0, bo_size);
> +		}
> +
> +		prev_idx = idx;
> +	}
> +	/* The last iteration may have left a BO mapped at data: undo it as inside the loop */
> +	if (bo) {
> +		__xe_vm_bind_assert(fd, vm, 0,
> +				    0, 0, addr, bo_size,
> +				    DRM_XE_VM_BIND_OP_MAP,
> +				    DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,
> +				    NULL, 0, 0, 0);
> +		munmap(data, bo_size);
> +		data = NULL;
> +		gem_close(fd, bo);
> +	}
> +	/* BUSY: unbinding while the buffers are still in place is expected to fail with -EBUSY */
> +	if (flags & BUSY)
> +		igt_assert_eq(unbind_system_allocator(), -EBUSY);
> +
> +	for (i = 0; i < n_exec_queues; i++)
> +		xe_exec_queue_destroy(fd, exec_queues[i]);
> +
> +	if (exec_ufence)
> +		munmap(exec_ufence, SZ_4K);
> +
> +	if (flags & LOCK)
> +		munlock(data, bo_size);
> +
> +	if (file_fd != -1)
> +		close(file_fd);
> +	/* Release every buffer parked by the NEW-without-FREE path */
> +	if (flags & NEW && !(flags & FREE)) {
> +		for (i = 0; i < n_execs; i++) {
> +			if (!pending_free[i])
> +				continue;
> +
> +			if (flags & MMAP)
> +				munmap(pending_free[i], bo_size);
> +			else
> +				free(pending_free[i]);
> +		}
> +		free(pending_free);
> +	}
> +	if (data) {
> +		if (flags & MMAP)
> +			munmap(data, bo_size);
> +		else if (!alloc)	/* a caller-provided buffer stays owned by the caller */
> +			free(data);
> +	}
> +	if (free_vm)
> +		xe_vm_destroy(fd, vm);
> +}
> +/* Arguments for one worker started by threads(); thread() forwards most of them to test_exec(). */
> +struct thread_data {
> +	pthread_t thread;		/* handle filled by pthread_create(), joined in threads() */
> +	pthread_mutex_t *mutex;		/* protects *go */
> +	pthread_cond_t *cond;		/* broadcast by threads() once *go is set */
> +	pthread_barrier_t *barrier;	/* per-exec barrier, NULL unless SYNC_EXEC is set */
> +	int fd;
> +	struct drm_xe_engine_class_instance *eci;
> +	int n_exec_queues;
> +	int n_execs;
> +	size_t bo_size;
> +	size_t stride;
> +	uint32_t vm;			/* shared VM handle, or 0 to let test_exec() create one */
> +	unsigned int flags;
> +	void *alloc;			/* this thread's slice of the shared buffer, or NULL */
> +	bool *go;			/* start flag shared by all workers */
> +};
> +
> +/*
> + * pthread entry point: park until threads() raises the shared go flag,
> + * then run test_exec() with the arguments bundled in the thread_data.
> + * Always returns NULL.
> + */
> +static void *thread(void *data)
> +{
> +	struct thread_data *td = data;
> +
> +	/* Start gate: sleep on the condition variable until *go becomes true */
> +	pthread_mutex_lock(td->mutex);
> +	for (;;) {
> +		if (*td->go)
> +			break;
> +		pthread_cond_wait(td->cond, td->mutex);
> +	}
> +	pthread_mutex_unlock(td->mutex);
> +
> +	test_exec(td->fd, td->eci, td->n_exec_queues, td->n_execs,
> +		  td->bo_size, td->stride, td->vm, td->alloc, td->barrier,
> +		  td->flags);
> +
> +	return NULL;
> +}
> +
> +/*
> + * threads() - run test_exec() on every engine concurrently, one pthread each.
> + *
> + * @fd:            open Xe DRM fd shared by all workers
> + * @n_exec_queues: forwarded to test_exec()
> + * @n_execs:       forwarded to test_exec()
> + * @bo_size:       forwarded to test_exec()
> + * @stride:        forwarded to test_exec(); must be non-zero with SHARED_ALLOC
> + * @flags:         test_exec() flags. SHARED_ALLOC is handled here (one buffer
> + *                 is allocated for all workers and the flag is cleared before
> + *                 forwarding); SYNC_EXEC makes the workers share a barrier.
> + * @shared_vm:     true to create one LR + fault-mode VM used by all workers
> + *
> + * FILE_BACKED and FORK_READ variants are not run threaded; the function
> + * returns without doing anything for them.
> + */
> +static void
> +threads(int fd, int n_exec_queues, int n_execs, size_t bo_size,
> +	size_t stride, unsigned int flags, bool shared_vm)
> +{
> +	struct drm_xe_engine_class_instance *hwe;
> +	struct thread_data *threads_data;
> +	int n_engines = 0, i = 0;
> +	pthread_mutex_t mutex;
> +	pthread_cond_t cond;
> +	pthread_barrier_t barrier;
> +	uint32_t vm = 0;
> +	bool go = false;
> +	void *alloc = NULL;
> +
> +	if ((FILE_BACKED | FORK_READ) & flags)
> +		return;
> +
> +	xe_for_each_engine(fd, hwe)
> +		++n_engines;
> +
> +	if (shared_vm) {
> +		vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
> +				  DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
> +		bind_system_allocator(NULL, 0);
> +	}
> +
> +	if (flags & SHARED_ALLOC) {
> +		uint64_t alloc_size;
> +
> +		igt_assert(stride);
> +
> +		/* One buffer large enough for every worker's strided slots */
> +		alloc_size = sizeof(struct test_exec_data) * stride *
> +			n_execs * n_engines;
> +		alloc_size = xe_bb_size(fd, alloc_size);
> +		alloc = aligned_alloc(SZ_2M, alloc_size);
> +		igt_assert(alloc);
> +
> +		memset(alloc, 0, alloc_size);
> +		/* test_exec() bails out on SHARED_ALLOC, so do not forward it */
> +		flags &= ~SHARED_ALLOC;
> +	}
> +
> +	threads_data = calloc(n_engines, sizeof(*threads_data));
> +	igt_assert(threads_data);
> +
> +	pthread_mutex_init(&mutex, NULL);
> +	pthread_cond_init(&cond, NULL);
> +	pthread_barrier_init(&barrier, NULL, n_engines);
> +
> +	xe_for_each_engine(fd, hwe) {
> +		threads_data[i].mutex = &mutex;
> +		threads_data[i].cond = &cond;
> +		threads_data[i].barrier = (flags & SYNC_EXEC) ? &barrier : NULL;
> +		threads_data[i].fd = fd;
> +		threads_data[i].eci = hwe;
> +		threads_data[i].n_exec_queues = n_exec_queues;
> +		threads_data[i].n_execs = n_execs;
> +		threads_data[i].bo_size = bo_size;
> +		threads_data[i].stride = stride;
> +		threads_data[i].vm = vm;
> +		threads_data[i].flags = flags;
> +		/* Worker i starts one slot further into the shared buffer */
> +		threads_data[i].alloc = alloc ? alloc + i *
> +			sizeof(struct test_exec_data) : NULL;
> +		threads_data[i].go = &go;
> +		pthread_create(&threads_data[i].thread, 0, thread,
> +			       &threads_data[i]);
> +		++i;
> +	}
> +
> +	/* Release all workers at once */
> +	pthread_mutex_lock(&mutex);
> +	go = true;
> +	pthread_cond_broadcast(&cond);
> +	pthread_mutex_unlock(&mutex);
> +
> +	for (i = 0; i < n_engines; ++i)
> +		pthread_join(threads_data[i].thread, NULL);
> +
> +	if (shared_vm) {
> +		int ret;
> +
> +		if (flags & MMAP) {
> +			int tries = 300;
> +
> +			while (tries && (ret = unbind_system_allocator()) == -EBUSY) {
> +				/*
> +				 * sleep() takes whole seconds, so the former
> +				 * sleep(.01) truncated to sleep(0) and this
> +				 * loop spun without delay. Wait 10ms instead.
> +				 */
> +				usleep(10000);
> +				--tries;
> +			}
> +			igt_assert_eq(ret, 0);
> +		}
> +		xe_vm_destroy(fd, vm);
> +	}
> +
> +	/*
> +	 * The shared buffer is allocated regardless of shared_vm, so it must
> +	 * be released regardless of it too (free(NULL) is a no-op).
> +	 */
> +	free(alloc);
> +
> +	pthread_barrier_destroy(&barrier);
> +	pthread_cond_destroy(&cond);
> +	pthread_mutex_destroy(&mutex);
> +	free(threads_data);
> +}
> +
> +/*
> + * process() - body of one child forked by processes().
> + *
> + * Maps the shared sync file, blocks in wait_pdata() until the parent
> + * signals, then opens its own Xe fd and runs test_exec() on @hwe with a
> + * private VM (vm == 0) and no caller buffer or barrier.
> + *
> + * @hwe:           engine to run on
> + * @n_exec_queues: forwarded to test_exec()
> + * @n_execs:       forwarded to test_exec()
> + * @bo_size:       forwarded to test_exec()
> + * @stride:        forwarded to test_exec()
> + * @flags:         forwarded to test_exec()
> + */
> +static void process(struct drm_xe_engine_class_instance *hwe, int n_exec_queues,
> +		    int n_execs, size_t bo_size, size_t stride,
> +		    unsigned int flags)
> +{
> +	struct process_data *pdata;
> +	int map_fd;
> +	int fd;
> +
> +	/* No O_CREAT here, so no mode argument is needed */
> +	map_fd = open(sync_file, O_RDWR);
> +	igt_assert(map_fd >= 0);
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +	igt_assert(pdata != MAP_FAILED);
> +	wait_pdata(pdata);
> +
> +	fd = drm_open_driver(DRIVER_XE);
> +	test_exec(fd, hwe, n_exec_queues, n_execs,
> +		  bo_size, stride, 0, NULL, NULL, flags);
> +	drm_close_driver(fd);
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +
> +/*
> + * processes() - run test_exec() in one forked child per engine.
> + *
> + * Creates/maps the shared sync file, forks a child per engine running
> + * process(), signals them through the shared mapping and waits for all
> + * children. FORK_READ variants are not run multi-process (silent return).
> + *
> + * @fd:            open Xe DRM fd, used only to enumerate engines
> + * @n_exec_queues: forwarded to process()
> + * @n_execs:       forwarded to process()
> + * @bo_size:       forwarded to process()
> + * @stride:        forwarded to process()
> + * @flags:         forwarded to process()
> + */
> +static void
> +processes(int fd, int n_exec_queues, int n_execs, size_t bo_size,
> +	  size_t stride, unsigned int flags)
> +{
> +	struct drm_xe_engine_class_instance *hwe;
> +	struct process_data *pdata;
> +	int map_fd;
> +
> +	if (flags & FORK_READ)
> +		return;
> +
> +	/*
> +	 * Permission bits are octal: the former 0x666 is 03146, i.e.
> +	 * setgid + sticky with no owner read/write permission.
> +	 */
> +	map_fd = open(sync_file, O_RDWR | O_CREAT, 0666);
> +	igt_assert(map_fd >= 0);
> +	posix_fallocate(map_fd, 0, sizeof(*pdata));
> +	pdata = mmap(NULL, sizeof(*pdata), PROT_READ |
> +		     PROT_WRITE, MAP_SHARED, map_fd, 0);
> +	igt_assert(pdata != MAP_FAILED);
> +
> +	init_pdata(pdata, 0);
> +
> +	xe_for_each_engine(fd, hwe) {
> +		igt_fork(child, 1)
> +			process(hwe, n_exec_queues, n_execs, bo_size,
> +				stride, flags);
> +	}
> +
> +	/* Children block in wait_pdata() until this point */
> +	signal_pdata(pdata);
> +	igt_waitchildren();
> +
> +	close(map_fd);
> +	munmap(pdata, sizeof(*pdata));
> +}
> +/* One row of a subtest table in igt_main: a name suffix and the flags run under it. */
> +struct section {
> +	const char *name;	/* appended to the subtest name; NULL terminates a table */
> +	unsigned int flags;	/* flag set passed to the test function */
> +};
> +/*
> + * Test entry point. The first fixture opens the Xe device and gates the
> + * whole binary on xe_supports_faults(); the body then registers every
> + * subtest: each sections[] entry in all size/stride/queue/thread/process
> + * variants, the shared-allocation thread tests, fault injection, partial
> + * unmap/remap, allocation benchmarks and eviction tests.
> + */
> +igt_main
> +{
> +	struct drm_xe_engine_class_instance *hwe;
> +	const struct section sections[] = {	/* flag sets for test_exec()/threads()/processes() */
> +		{ "malloc", 0 },
> +		{ "malloc-multi-fault", MULTI_FAULT },
> +		{ "malloc-fork-read", FORK_READ },
> +		{ "malloc-fork-read-after", FORK_READ | FORK_READ_AFTER },
> +		{ "malloc-mlock", LOCK },
> +		{ "malloc-race", RACE },
> +		{ "malloc-busy", BUSY },
> +		{ "malloc-bo-unmap", BO_UNMAP },
> +		{ "mmap", MMAP },
> +		{ "mmap-remap", MMAP | MREMAP },
> +		{ "mmap-remap-dontunmap", MMAP | MREMAP | DONTUNMAP },
> +		{ "mmap-remap-ro", MMAP | MREMAP | READ_ONLY_REMAP },
> +		{ "mmap-remap-ro-dontunmap", MMAP | MREMAP | DONTUNMAP |
> +			READ_ONLY_REMAP },
> +		{ "mmap-remap-eocheck", MMAP | MREMAP | EVERY_OTHER_CHECK },
> +		{ "mmap-remap-dontunmap-eocheck", MMAP | MREMAP | DONTUNMAP |
> +			EVERY_OTHER_CHECK },
> +		{ "mmap-remap-ro-eocheck", MMAP | MREMAP | READ_ONLY_REMAP |
> +			EVERY_OTHER_CHECK },
> +		{ "mmap-remap-ro-dontunmap-eocheck", MMAP | MREMAP | DONTUNMAP |
> +			READ_ONLY_REMAP | EVERY_OTHER_CHECK },
> +		{ "mmap-huge", MMAP | HUGE_PAGE },
> +		{ "mmap-shared", MMAP | LOCK | MMAP_SHARED },
> +		{ "mmap-shared-remap", MMAP | LOCK | MMAP_SHARED | MREMAP },
> +		{ "mmap-shared-remap-dontunmap", MMAP | LOCK | MMAP_SHARED |
> +			MREMAP | DONTUNMAP },
> +		{ "mmap-shared-remap-eocheck", MMAP | LOCK | MMAP_SHARED |
> +			MREMAP | EVERY_OTHER_CHECK },
> +		{ "mmap-shared-remap-dontunmap-eocheck", MMAP | LOCK |
> +			MMAP_SHARED | MREMAP | DONTUNMAP | EVERY_OTHER_CHECK },
> +		{ "mmap-mlock", MMAP | LOCK },
> +		{ "mmap-file", MMAP | FILE_BACKED },
> +		{ "mmap-file-mlock", MMAP | LOCK | FILE_BACKED },
> +		{ "mmap-race", MMAP | RACE },
> +		{ "free", NEW | FREE },
> +		{ "free-race", NEW | FREE | RACE },
> +		{ "new", NEW },
> +		{ "new-race", NEW | RACE },
> +		{ "new-bo-map", NEW | BO_MAP },
> +		{ "new-busy", NEW | BUSY },
> +		{ "mmap-free", MMAP | NEW | FREE },
> +		{ "mmap-free-huge", MMAP | NEW | FREE | HUGE_PAGE },
> +		{ "mmap-free-race", MMAP | NEW | FREE | RACE },
> +		{ "mmap-new", MMAP | NEW },
> +		{ "mmap-new-huge", MMAP | NEW | HUGE_PAGE },
> +		{ "mmap-new-race", MMAP | NEW | RACE },
> +		{ "malloc-nomemset", SKIP_MEMSET },
> +		{ "malloc-mlock-nomemset", SKIP_MEMSET | LOCK },
> +		{ "malloc-race-nomemset", SKIP_MEMSET | RACE },
> +		{ "malloc-busy-nomemset", SKIP_MEMSET | BUSY },
> +		{ "malloc-bo-unmap-nomemset", SKIP_MEMSET | BO_UNMAP },
> +		{ "mmap-nomemset", SKIP_MEMSET | MMAP },
> +		{ "mmap-huge-nomemset", SKIP_MEMSET | MMAP | HUGE_PAGE },
> +		{ "mmap-shared-nomemset", SKIP_MEMSET | MMAP | MMAP_SHARED },
> +		{ "mmap-mlock-nomemset", SKIP_MEMSET | MMAP | LOCK },
> +		{ "mmap-file-nomemset", SKIP_MEMSET | MMAP | FILE_BACKED },
> +		{ "mmap-file-mlock-nomemset", SKIP_MEMSET | MMAP | LOCK | FILE_BACKED },
> +		{ "mmap-race-nomemset", SKIP_MEMSET | MMAP | RACE },
> +		{ "free-nomemset", SKIP_MEMSET | NEW | FREE },
> +		{ "free-race-nomemset", SKIP_MEMSET | NEW | FREE | RACE },
> +		{ "new-nomemset", SKIP_MEMSET | NEW },
> +		{ "new-race-nomemset", SKIP_MEMSET | NEW | RACE },
> +		{ "new-bo-map-nomemset", SKIP_MEMSET | NEW | BO_MAP },
> +		{ "new-busy-nomemset", SKIP_MEMSET | NEW | BUSY },
> +		{ "mmap-free-nomemset", SKIP_MEMSET | MMAP | NEW | FREE },
> +		{ "mmap-free-huge-nomemset", SKIP_MEMSET | MMAP | NEW | FREE | HUGE_PAGE },
> +		{ "mmap-free-race-nomemset", SKIP_MEMSET | MMAP | NEW | FREE | RACE },
> +		{ "mmap-new-nomemset", SKIP_MEMSET | MMAP | NEW },
> +		{ "mmap-new-huge-nomemset", SKIP_MEMSET | MMAP | NEW | HUGE_PAGE },
> +		{ "mmap-new-race-nomemset", SKIP_MEMSET | MMAP | NEW | RACE },
> +		{ NULL },
> +	};
> +	const struct section psections[] = {	/* flag sets for partial() */
> +		{ "munmap-cpu-fault", CPU_FAULT },
> +		{ "munmap-no-cpu-fault", 0 },
> +		{ "remap-cpu-fault", CPU_FAULT | REMAP },
> +		{ "remap-no-cpu-fault", REMAP },
> +		{ "middle-munmap-cpu-fault", MIDDLE | CPU_FAULT },
> +		{ "middle-munmap-no-cpu-fault", MIDDLE },
> +		{ "middle-remap-cpu-fault", MIDDLE | CPU_FAULT | REMAP },
> +		{ "middle-remap-no-cpu-fault", MIDDLE | REMAP },
> +		{ NULL },
> +	};
> +	const struct section esections[] = {	/* flag sets for the eviction tests */
> +		{ "malloc", 0 },
> +		{ "malloc-mix-bo", MIX_BO_ALLOC },
> +		{ NULL },
> +	};
> +	int fd;
> +
> +	igt_fixture {
> +		struct xe_device *xe;
> +
> +		fd = drm_open_driver(DRIVER_XE);
> +		igt_require(!xe_supports_faults(fd));	/* presumably returns 0 when faults are supported - confirm */
> +
> +		xe = xe_device_get(fd);
> +		va_bits = xe->va_bits;
> +		open_sync_file();
> +	}
> +	/* Every sections[] entry is run in each single-thread, threaded and multi-process variant below */
> +	for (const struct section *s = sections; s->name; s++) {
> +		igt_subtest_f("once-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 1, 0, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("once-large-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 1, SZ_2M, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("twice-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 2, 0, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("twice-large-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 2, SZ_2M, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("many-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 128, 0, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("many-stride-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 128, 0, 256, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("many-execqueues-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 16, 128, 0, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("many-large-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 1, 128, SZ_2M, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("many-large-execqueues-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				test_exec(fd, hwe, 16, 128, SZ_2M, 0, 0, NULL,
> +					  NULL, s->flags);
> +
> +		igt_subtest_f("threads-many-%s", s->name)
> +			threads(fd, 1, 128, 0, 0, s->flags, false);
> +
> +		igt_subtest_f("threads-many-stride-%s", s->name)
> +			threads(fd, 1, 128, 0, 256, s->flags, false);
> +
> +		igt_subtest_f("threads-many-execqueues-%s", s->name)
> +			threads(fd, 16, 128, 0, 0, s->flags, false);
> +
> +		igt_subtest_f("threads-many-large-%s", s->name)
> +			threads(fd, 1, 128, SZ_2M, 0, s->flags, false);
> +
> +		igt_subtest_f("threads-many-large-execqueues-%s", s->name)
> +			threads(fd, 16, 128, SZ_2M, 0, s->flags, false);
> +
> +		igt_subtest_f("threads-shared-vm-many-%s", s->name)
> +			threads(fd, 1, 128, 0, 0, s->flags, true);
> +
> +		igt_subtest_f("threads-shared-vm-many-stride-%s", s->name)
> +			threads(fd, 1, 128, 0, 256, s->flags, true);
> +
> +		igt_subtest_f("threads-shared-vm-many-execqueues-%s", s->name)
> +			threads(fd, 16, 128, 0, 0, s->flags, true);
> +
> +		igt_subtest_f("threads-shared-vm-many-large-%s", s->name)
> +			threads(fd, 1, 128, SZ_2M, 0, s->flags, true);
> +
> +		igt_subtest_f("threads-shared-vm-many-large-execqueues-%s", s->name)
> +			threads(fd, 16, 128, SZ_2M, 0, s->flags, true);
> +
> +		igt_subtest_f("process-many-%s", s->name)
> +			processes(fd, 1, 128, 0, 0, s->flags);
> +
> +		igt_subtest_f("process-many-stride-%s", s->name)
> +			processes(fd, 1, 128, 0, 256, s->flags);
> +
> +		igt_subtest_f("process-many-execqueues-%s", s->name)
> +			processes(fd, 16, 128, 0, 0, s->flags);
> +
> +		igt_subtest_f("process-many-large-%s", s->name)
> +			processes(fd, 1, 128, SZ_2M, 0, s->flags);
> +
> +		igt_subtest_f("process-many-large-execqueues-%s", s->name)
> +			processes(fd, 16, 128, SZ_2M, 0, s->flags);
> +	}
> +	/* Shared-allocation thread tests: threads() allocates one buffer that all workers index into */
> +	igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc")
> +		threads(fd, 1, 128, 0, 256, SHARED_ALLOC, true);
> +
> +	igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc-race")
> +		threads(fd, 1, 128, 0, 256, RACE | SHARED_ALLOC, true);
> +
> +	igt_subtest("threads-shared-alloc-many-stride-malloc")
> +		threads(fd, 1, 128, 0, 256, SHARED_ALLOC, false);
> +
> +	igt_subtest("threads-shared-alloc-many-stride-malloc-sync")
> +		threads(fd, 1, 128, 0, 256, SHARED_ALLOC | SYNC_EXEC, false);
> +
> +	igt_subtest("threads-shared-alloc-many-stride-malloc-race")
> +		threads(fd, 1, 128, 0, 256, RACE | SHARED_ALLOC, false);
> +
> +	igt_subtest_f("fault")
> +		xe_for_each_engine(fd, hwe)
> +			test_exec(fd, hwe, 4, 1, SZ_2M, 0, 0, NULL, NULL,
> +				  FAULT);
> +
> +	for (const struct section *s = psections; s->name; s++) {
> +		igt_subtest_f("partial-%s", s->name)
> +			xe_for_each_engine(fd, hwe)
> +				partial(fd, hwe, s->flags);
> +	}
> +
> +	igt_subtest_f("unaligned-alloc")
> +		xe_for_each_engine(fd, hwe) {
> +			many_allocs(fd, hwe, (SZ_1M + SZ_512K) * 8,
> +				    SZ_1M + SZ_512K, SZ_4K, NULL, 0);
> +			break;	/* first engine only */
> +		}
> +
> +	igt_subtest_f("fault-benchmark")
> +		xe_for_each_engine(fd, hwe)
> +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> +				    BENCHMARK);
> +
> +	igt_subtest_f("fault-threads-benchmark")
> +		xe_for_each_engine(fd, hwe)
> +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> +				    BENCHMARK | CPU_FAULT_THREADS);
> +
> +	igt_subtest_f("fault-threads-same-page-benchmark")
> +		xe_for_each_engine(fd, hwe)
> +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> +				    BENCHMARK | CPU_FAULT_THREADS |
> +				    CPU_FAULT_SAME_PAGE);
> +
> +	igt_subtest_f("fault-process-benchmark")
> +		xe_for_each_engine(fd, hwe)
> +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> +				    BENCHMARK | CPU_FAULT_PROCESS);
> +
> +	igt_subtest_f("fault-process-same-page-benchmark")
> +		xe_for_each_engine(fd, hwe)
> +			many_allocs(fd, hwe, SZ_64M, SZ_64M, SZ_4K, NULL,
> +				    BENCHMARK | CPU_FAULT_PROCESS |
> +				    CPU_FAULT_SAME_PAGE);
> +
> +	for (const struct section *s = esections; s->name; s++) {
> +		igt_subtest_f("evict-%s", s->name)
> +			xe_for_each_engine(fd, hwe) {
> +				many_allocs(fd, hwe,
> +					    xe_visible_vram_size(fd, hwe->gt_id),
> +					    SZ_8M, SZ_1M, NULL, s->flags);
> +				break;	/* first engine only */
> +			}
> +	}
> +
> +	for (const struct section *s = esections; s->name; s++) {
> +		igt_subtest_f("processes-evict-%s", s->name)
> +			processes_evict(fd, SZ_8M, SZ_1M, s->flags);
> +	}
> +
> +	igt_fixture {
> +		xe_device_put(fd);
> +		drm_close_driver(fd);
> +		close_sync_file();
> +	}
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 6328792e3a..20ddddb89f 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -295,6 +295,7 @@ intel_xe_progs = [
>  	'xe_exec_reset',
>  	'xe_exec_sip',
>  	'xe_exec_store',
> +	'xe_exec_system_allocator',
>  	'xe_exec_threads',
>  	'xe_exercise_blt',
>  	'xe_fault_injection',
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2025-04-25  7:08 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-04-24 20:44 [PATCH v3 0/2] Add system_allocator test Matthew Brost
2025-04-24 20:44 ` [PATCH v3 1/2] uapi/xe: Sync latest uAPI KMD headers Matthew Brost
2025-04-24 22:32   ` Cavitt, Jonathan
2025-04-25  6:54   ` Francois Dugast
2025-04-24 20:44 ` [PATCH v3 2/2] tests/xe: Add system_allocator test Matthew Brost
2025-04-24 22:32   ` Cavitt, Jonathan
2025-04-24 22:39     ` Matthew Brost
2025-04-25  7:06   ` Francois Dugast

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox