[PATCH i-g-t v6] test/intel/xe_vm:Add oversubscribe concurrent bind stress subtest

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH i-g-t v6] test/intel/xe_vm:Add oversubscribe concurrent bind stress subtest
@ 2026-05-06 14:10 Sobin Thomas
  0 siblings, 0 replies; 3+ messages in thread
From: Sobin Thomas @ 2026-05-06 14:10 UTC (permalink / raw)
  To: igt-dev, thomas.hellstrom; +Cc: nishit.sharma, kamil.konieczny, Sobin Thomas

Add a test that oversubscribes VRAM in a multi-process environment where
each process creates a VM, binds large BOs and submits a workload at
nearly the same time.

Previous coverage lacked a scenario combining multi-process bind
with VRAM oversubscription. This generates memory pressure with
multi-process VM Bind activity and concurrent submission, exercising
the bind pipeline under eviction pressure.

v2: Removed helper APIs usage clock_nanosleep and commented
code.(Nishit)

v3: Refactored code into smaller functions.
    Added a check for available system RAM and capped the number of
    processes at 20.

v4: Remove explicit macros definition
    Replace Bind ioctl with library calls.(Thomas)
v5: Remove unused query_mem_info
    Fix xe_exec_with_retry (Thomas)
    Rename align_to_page_size with ALIGN macro (kamil/Thomas)
v6: Fix vm_bind_bo_batch: move igt_assert(ufence) before first dereference
    Fix create_test_bos: check errno instead of ret for ENOMEM/ENOSPC
    detection, since igt_ioctl returns -1 on failure. (Thomas)

Signed-off-by: Sobin Thomas <sobin.thomas@intel.com>
---
 tests/intel/xe_vm.c | 401 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 401 insertions(+)

diff --git a/tests/intel/xe_vm.c b/tests/intel/xe_vm.c
index 408bfdb71..fe4174458 100644
--- a/tests/intel/xe_vm.c
+++ b/tests/intel/xe_vm.c
@@ -21,6 +21,7 @@
 #include "xe/xe_spin.h"
 #include <string.h>
 #define USER_FENCE_VALUE 0xdeadbeefdeadbeefull
+#define GB(x) (1024ULL * 1024ULL * 1024ULL * (x))
 
 enum overcommit_stage {
 	EXPECT_NONE,
@@ -29,6 +30,69 @@ enum overcommit_stage {
 	EXPECT_EXEC,
 };
 
+struct gem_bo { /* one buffer object: handle, size, CPU mapping, GPU address */
+	uint32_t handle;	/* GEM handle; released with gem_close() */
+	uint64_t size;		/* bytes; used as bind range and munmap() length */
+	int *ptr;		/* CPU mapping from xe_bo_map(), NULL while unmapped */
+	uint64_t addr;		/* GPU virtual address passed to the bind calls */
+};
+
+struct xe_test_ctx { /* a VM and the exec queue created on it */
+	uint32_t vm_id;		/* id returned by xe_vm_create() */
+	uint32_t exec_queue_id;	/* id returned by xe_exec_queue_create() */
+};
+
+struct mem_bind_sync { /* a set of BOs bound together, plus the bind's fence */
+	struct gem_bo *bufs;	/* array of BOs in the set */
+	int n_bufs;		/* valid entries in bufs; lowered if allocation fails */
+	uint64_t *binds_ufence;	/* user fence returned by vm_bind_bo_batch() */
+};
+
+static void create_exec_queue(int fd, struct xe_test_ctx *ctx)
+{
+	struct drm_xe_engine_class_instance first = { 0 };
+	struct drm_xe_engine_class_instance *it;
+
+	/* Take the first engine reported; 'first' stays zeroed if none is */
+	xe_for_each_engine(fd, it) {
+		first = *it;
+		break;
+	}
+	ctx->exec_queue_id = xe_exec_queue_create(fd, ctx->vm_id, &first, 0);
+}
+
+static uint64_t *
+vm_bind_bo_batch(int fd, struct xe_test_ctx *ctx, struct gem_bo *bos, int size)
+{
+	struct drm_xe_vm_bind_op binds[size];
+	struct drm_xe_sync bind_sync = {
+		.type = DRM_XE_SYNC_TYPE_USER_FENCE,
+		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
+		.timeline_value = 1,
+	};
+	uint64_t *ufence;
+
+	/* calloc() hands back a zeroed fence; it is returned to the caller */
+	ufence = calloc(1, sizeof(*ufence));
+	igt_assert(ufence);
+	bind_sync.addr = to_user_pointer(ufence);
+
+	/* One MAP op per BO: whole object, at the address stored in the BO */
+	for (int n = 0; n < size; n++) {
+		binds[n] = (struct drm_xe_vm_bind_op) {
+			.obj = bos[n].handle,
+			.obj_offset = 0,
+			.range = bos[n].size,
+			.addr = bos[n].addr,
+			.op = DRM_XE_VM_BIND_OP_MAP,
+			.flags = 0,
+		};
+	}
+
+	xe_vm_bind_array(fd, ctx->vm_id, 0, binds, size, &bind_sync, 1);
+	return ufence;
+}
+
 static uint32_t
 addr_low(uint64_t addr)
 {
@@ -3073,6 +3137,338 @@ static void test_get_property(int fd, void (*func)(int fd, uint32_t vm))
 	xe_vm_destroy(fd, vm);
 }
 
+static int build_add_batch(struct gem_bo *batch_bo, struct gem_bo *integers_bo,
+			   struct gem_bo *result_bo, int ints_to_add)
+{
+	int *cs = batch_bo->ptr;
+	int pos = 0, i = 0;
+	uint64_t gpu_addr;
+	#define GPR_RX_ADDR(x)		(0x600 + (x) * 8)
+
+	/* Sum on the GPU: r0 = integers_bo[0], every further integer goes via r1 */
+	do {
+		cs[pos++] =  MI_LOAD_REGISTER_MEM_CMD | MI_LRI_LRM_CS_MMIO | 2;
+		cs[pos++] = GPR_RX_ADDR(i ? 1 : 0);
+		gpu_addr = integers_bo->addr + i * sizeof(uint32_t);
+		cs[pos++] = gpu_addr & 0xFFFFFFFF;
+		cs[pos++] = (gpu_addr >> 32) & 0xFFFFFFFF;
+		if (i == 0)
+			continue;
+		/* r0 = r0 + r1 */
+		cs[pos++] = MI_MATH(4);
+		cs[pos++] = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(0));
+		cs[pos++] = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(1));
+		cs[pos++] = MI_MATH_ADD;
+		cs[pos++] = MI_MATH_STORE(MI_MATH_REG(0), MI_MATH_REG_ACCU);
+	} while (++i < ints_to_add);
+
+	/* result_bo[0] = r0 */
+	cs[pos++] = MI_STORE_REGISTER_MEM_GEN8 | MI_LRI_LRM_CS_MMIO;
+	cs[pos++] = GPR_RX_ADDR(0);
+	gpu_addr = result_bo->addr;
+	cs[pos++] = gpu_addr & 0xFFFFFFFF;
+	cs[pos++] = (gpu_addr >> 32) & 0xFFFFFFFF;
+
+	/* End the batch, pad to a multiple of four dwords, return dword count */
+	cs[pos++] = MI_BATCH_BUFFER_END;
+	while (pos % 4 != 0)
+		cs[pos++] = MI_NOOP;
+	return pos;
+}
+
+static void create_test_bos(int fd, struct xe_test_ctx *ctx, struct mem_bind_sync *bind,
+			    uint32_t  placement, uint64_t *addr)
+{
+	const char *mem_type = (placement & vram_memory(fd, 0)) ? "VRAM" : "SRAM";
+	int ret;
+
+	for (int i = 0; i < bind->n_bufs; i++) {
+		struct gem_bo *bo = &bind->bufs[i];
+
+		bo->size = GB(1);
+		ret = __xe_bo_create_caching(fd, ctx->vm_id, bo->size, placement, 0,
+					     DRM_XE_GEM_CPU_CACHING_WC, &bo->handle);
+		/* Out of memory just trims the set to i BOs; other errors are fatal */
+		if (ret && (errno == ENOMEM || errno == ENOSPC)) {
+			bind->n_bufs = i;
+			igt_debug("%s allocation failed at buffer %d\n", mem_type, i);
+			break;
+		}
+		igt_assert_eq(ret, 0);
+		bo->ptr = NULL;
+		bo->addr = *addr;
+		*addr += bo->size;
+		igt_debug("%s buffer %d created at 0x%016" PRIx64 "\n", mem_type, i, bo->addr);
+	}
+}
+
+static int fill_random_integers(struct gem_bo *int_bo, int ints_to_add)
+{
+	uint32_t sum = 0;
+
+	/* Store rand() % 8 values in the BO mapping, logging "a + b + ... = sum" */
+	for (int idx = 0; idx < ints_to_add; idx++) {
+		int value = rand() % 8;
+
+		sum += value;
+		int_bo->ptr[idx] = value;
+		igt_debug("%d", value);
+		if (idx + 1 == ints_to_add)
+			igt_debug(" = ");
+		else
+			igt_debug(" + ");
+	}
+	igt_debug("%d\n", sum);
+	return sum;
+}
+
+/*
+ * Submit @exec, retrying for as long as the kernel reports ENOMEM or ENOSPC.
+ *
+ * The test using this helper forks several processes that bind large buffers
+ * and submit batches at nearly the same time, so a submission may fail
+ * transiently while they compete for memory. Retrying with a growing delay
+ * lets the test tell such temporary exhaustion apart from real driver bugs.
+ * Any other errno stops the loop immediately.
+ *
+ * At most @max_retries submissions are attempted. Returns the result of the
+ * last ioctl (0 on success); on failure the errno of that ioctl is logged.
+ * That errno is captured right after the ioctl because the delay and the
+ * logging in between may overwrite the global one.
+ */
+static int xe_exec_with_retry(int fd, struct drm_xe_exec *exec, int max_retries)
+{
+	int rc = 0, err = 0, retries;
+
+	for (retries = 0; retries < max_retries; retries++) {
+		rc = igt_ioctl(fd, DRM_IOCTL_XE_EXEC, exec);
+		err = rc ? errno : 0;
+		if (!(rc && (err == ENOMEM || err == ENOSPC)))
+			break;
+
+		usleep(100 * retries);
+		if (retries == 0)
+			igt_warn("got %s, retrying\n", strerror(err));
+	}
+
+	if (retries == max_retries)
+		igt_warn("gave up after %d retries\n", retries);
+
+	if (rc)
+		igt_warn("errno: %d (%s)\n", err, strerror(err));
+
+	return rc;
+}
+
+static void cleanup_bo_resources(int fd, struct gem_bo *bo)
+{
+	/* Drop the CPU mapping first (if any), then the GEM handle (if any) */
+	if (bo->ptr != NULL)
+		igt_assert_eq(munmap(bo->ptr, bo->size), 0);
+	bo->ptr = NULL;
+	if (bo->handle != 0)
+		gem_close(fd, bo->handle);
+}
+
+static void cleanup_sram_vram_objs(int fd, struct mem_bind_sync *vram_bind,
+				   struct mem_bind_sync *sram_bind)
+{
+	struct mem_bind_sync *sets[] = { vram_bind, sram_bind };
+
+	/* Per set: close each BO handle, then free the array and the bind fence */
+	for (int s = 0; s < 2; s++) {
+		for (int i = 0; i < sets[s]->n_bufs; i++)
+			gem_close(fd, sets[s]->bufs[i].handle);
+		free(sets[s]->bufs);
+		if (sets[s]->n_bufs)
+			free(sets[s]->binds_ufence);
+	}
+}
+
+/**
+ * SUBTEST: oversubscribe-concurrent-bind
+ * Description: Oversubscribe VRAM from multiple processes that bind large
+ * BOs and submit work at the same time, and ensure they all complete.
+ * Functionality: This check is for a specific bug where, if multiple
+ * processes oversubscribe VRAM, some of the binds may fail with ENOMEM
+ * due to a deadlock in the bind code.
+ * Test category: stress test
+ */
+static void test_vm_oversubscribe_concurrent_bind(int fd)
+{
+	#define MIN_BUFS_PER_PROC 2
+	#define MAX_THREADS 20
+	int n_proc = 0, n_vram_bufs = 0, n_sram_bufs = 0;
+	uint32_t max_by_mem;
+	uint64_t total_vram_demand = 0;
+	uint64_t vram_size = xe_visible_available_vram_size(fd, 0);
+	uint64_t sram_avail = (uint64_t)igt_get_avail_ram_mb() << 20;
+	uint64_t target_vram = vram_size * 2;      /* 2x VRAM */
+	uint64_t target_sram = sram_avail * 50 / 100;  /* 50% system RAM */
+
+	int total_vram_bufs = target_vram / GB(1);
+	int total_sram_bufs = target_sram / GB(1);
+	pthread_barrier_t *barrier;
+	pthread_barrierattr_t attr;
+
+	/*
+	 * n_proc: room for MIN_BUFS_PER_PROC BOs of each kind, max MAX_THREADS
+	 */
+	max_by_mem = min(total_vram_bufs / MIN_BUFS_PER_PROC,
+			 total_sram_bufs / MIN_BUFS_PER_PROC);
+	igt_info("max_by_mem = %u\n", max_by_mem);
+	n_proc = min_t(uint32_t, max_by_mem, MAX_THREADS);
+	igt_require_f(n_proc > 0, "Not enough VRAM/RAM for oversubscription test\n");
+
+	n_vram_bufs = max(2, total_vram_bufs / n_proc);
+	n_sram_bufs = max(2, total_sram_bufs / n_proc);
+	total_vram_demand = (uint64_t)n_proc * n_vram_bufs * GB(1);
+
+	igt_debug("VRAM size: %" PRIu64 "MB, System RAM available: %" PRIu64 "MB\n",
+		  vram_size >> 20, sram_avail >> 20);
+
+	igt_debug(" n_proc = %d\n", n_proc);
+	igt_debug("VRAM: %" PRIu64 "GB\n", vram_size >> 30);
+	igt_debug("VRAM demand: %" PRIu64 "MB (%.2fx oversubscription)\n",
+		  total_vram_demand >> 20, (double)total_vram_demand / vram_size);
+	igt_debug("Processes=%d VRAM_bufs=%d SRAM_bufs=%d\n", n_proc,
+		  n_vram_bufs, n_sram_bufs);
+
+	barrier = mmap(NULL, sizeof(pthread_barrier_t),
+		       PROT_READ | PROT_WRITE,
+		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	igt_assert(barrier != MAP_FAILED);
+	igt_assert_eq(pthread_barrierattr_init(&attr), 0);
+	igt_assert_eq(pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED), 0);
+	igt_assert_eq(pthread_barrier_init(barrier, &attr, n_proc), 0);
+
+	igt_fork(child, n_proc) {
+		struct xe_test_ctx ctx = {0};
+		int rc;
+		uint64_t addr = 0x40000000;
+		int expected_result = 0, ints_to_add = 4;
+		int max_retries = 1024;
+		struct gem_bo integers_bo, result_bo, batch_bo, *vram_bufs, *sram_bufs;
+		int pos = 0;
+		struct mem_bind_sync vram_bind = {0};
+		struct mem_bind_sync sram_bind = {0};
+		struct drm_xe_sync batch_syncs[1];
+		struct drm_xe_exec exec;
+		struct gem_bo ufence_bo = {0};
+
+		vram_bufs = calloc(n_vram_bufs, sizeof(*vram_bufs));
+		sram_bufs = calloc(n_sram_bufs, sizeof(*sram_bufs));
+		srand(child);
+
+		igt_assert(vram_bufs && sram_bufs);
+
+		ctx.vm_id = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE, 0);
+		create_exec_queue(fd, &ctx);
+		vram_bind.bufs = vram_bufs;
+		vram_bind.n_bufs = n_vram_bufs;
+		sram_bind.bufs = sram_bufs;
+		sram_bind.n_bufs = n_sram_bufs;
+
+		create_test_bos(fd, &ctx, &vram_bind, vram_memory(fd, 0), &addr);
+		create_test_bos(fd, &ctx, &sram_bind, system_memory(fd), &addr);
+		/* Line up all processes so the binds below start together */
+		pthread_barrier_wait(barrier);
+
+		if (vram_bind.n_bufs)
+			vram_bind.binds_ufence = vm_bind_bo_batch(fd, &ctx, vram_bufs,
+								  vram_bind.n_bufs);
+
+		if (sram_bind.n_bufs)
+			sram_bind.binds_ufence = vm_bind_bo_batch(fd, &ctx, sram_bufs,
+								  sram_bind.n_bufs);
+
+		integers_bo.size = ALIGN(sizeof(int) * ints_to_add, 4096);
+		integers_bo.handle = xe_bo_create_caching(fd, ctx.vm_id, integers_bo.size,
+							  system_memory(fd), 0,
+							  DRM_XE_GEM_CPU_CACHING_WC);
+		integers_bo.ptr = (int *)xe_bo_map(fd, integers_bo.handle, integers_bo.size);
+		integers_bo.addr = 0x100000;
+
+		expected_result = fill_random_integers(&integers_bo, ints_to_add);
+		igt_debug("%d\n", expected_result);
+
+		result_bo.size = ALIGN(sizeof(int), 4096);
+		result_bo.handle  = xe_bo_create_caching(fd, ctx.vm_id, result_bo.size,
+							 system_memory(fd), 0,
+							 DRM_XE_GEM_CPU_CACHING_WC);
+		result_bo.ptr = NULL;
+		result_bo.addr = 0x200000;
+
+		batch_bo.size = 4096;
+		batch_bo.handle = xe_bo_create_caching(fd, ctx.vm_id, batch_bo.size,
+						       system_memory(fd), 0,
+						       DRM_XE_GEM_CPU_CACHING_WC);
+
+		batch_bo.ptr = (int *)xe_bo_map(fd, batch_bo.handle, batch_bo.size);
+		batch_bo.addr = 0x300000;
+
+		pos = build_add_batch(&batch_bo, &integers_bo, &result_bo, ints_to_add);
+
+		igt_assert(pos * sizeof(int) <= batch_bo.size);
+
+		/* Wait for large bind operations to complete before binding small BOs */
+		if (vram_bind.n_bufs)
+			xe_wait_ufence(fd, vram_bind.binds_ufence, 1, 0, INT64_MAX);
+		if (sram_bind.n_bufs)
+			xe_wait_ufence(fd, sram_bind.binds_ufence, 1, 0, INT64_MAX);
+
+		xe_vm_bind_lr_sync(fd, ctx.vm_id, integers_bo.handle, 0, integers_bo.addr,
+				   integers_bo.size, 0);
+		xe_vm_bind_lr_sync(fd, ctx.vm_id, result_bo.handle, 0, result_bo.addr,
+				   result_bo.size, 0);
+		xe_vm_bind_lr_sync(fd, ctx.vm_id, batch_bo.handle, 0, batch_bo.addr,
+				   batch_bo.size, 0);
+
+		ufence_bo.size = 4096;
+		ufence_bo.handle = xe_bo_create_caching(fd, ctx.vm_id, ufence_bo.size,
+							system_memory(fd), 0,
+							DRM_XE_GEM_CPU_CACHING_WB);
+		ufence_bo.ptr = (int *)xe_bo_map(fd, ufence_bo.handle, ufence_bo.size);
+		ufence_bo.addr = 0x400000;
+		memset(ufence_bo.ptr, 0, ufence_bo.size);
+		xe_vm_bind_lr_sync(fd, ctx.vm_id, ufence_bo.handle, 0, ufence_bo.addr,
+				   ufence_bo.size, 0);
+
+		batch_syncs[0] = (struct drm_xe_sync){
+			.type = DRM_XE_SYNC_TYPE_USER_FENCE,
+			.flags = DRM_XE_SYNC_FLAG_SIGNAL,
+			.addr = ufence_bo.addr,
+			.timeline_value = 1,
+		};
+
+		exec = (struct drm_xe_exec) {
+			.exec_queue_id = ctx.exec_queue_id,
+			.num_syncs = 1,
+			.syncs = (uintptr_t)batch_syncs,
+			.address = batch_bo.addr,
+			.num_batch_buffer = 1,
+		};
+
+		rc = xe_exec_with_retry(fd, &exec, max_retries);
+		igt_assert_eq(rc, 0);
+		xe_wait_ufence(fd, (uint64_t *)ufence_bo.ptr, 1, ctx.exec_queue_id, INT64_MAX);
+		result_bo.ptr = (int *)xe_bo_map(fd, result_bo.handle, result_bo.size);
+		igt_assert_eq(result_bo.ptr[0], expected_result);
+		cleanup_bo_resources(fd, &ufence_bo);
+		cleanup_bo_resources(fd, &result_bo);
+		cleanup_bo_resources(fd, &batch_bo);
+		cleanup_bo_resources(fd, &integers_bo);
+		cleanup_sram_vram_objs(fd, &vram_bind, &sram_bind);
+		xe_exec_queue_destroy(fd, ctx.exec_queue_id);
+		xe_vm_destroy(fd, ctx.vm_id);
+		close(fd);
+	}
+	igt_waitchildren();
+	pthread_barrier_destroy(barrier);
+	pthread_barrierattr_destroy(&attr);
+	igt_assert_eq(munmap(barrier, sizeof(pthread_barrier_t)), 0);
+}
+
 int igt_main()
 {
 	struct drm_xe_engine_class_instance *hwe, *hwe_non_copy = NULL;
@@ -3486,6 +3882,11 @@ int igt_main()
 		igt_assert(xe_visible_vram_size(fd, 0));
 		test_oom(fd);
 	}
+	igt_subtest("oversubscribe-concurrent-bind")
+	{
+		igt_require(xe_has_vram(fd));
+		test_vm_oversubscribe_concurrent_bind(fd);
+	}
 
 	for (const struct vm_get_property *f = xe_vm_get_property_tests; f->name; f++) {
 		igt_subtest_f("vm-get-property-%s", f->name)
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH i-g-t v6] test/intel/xe_vm:Add oversubscribe concurrent bind stress subtest
@ 2026-05-12  2:47 Sobin Thomas
  2026-05-21 15:52 ` Sharma, Nishit
  0 siblings, 1 reply; 3+ messages in thread
From: Sobin Thomas @ 2026-05-12  2:47 UTC (permalink / raw)
  To: igt-dev, thomas.hellstrom; +Cc: nishit.sharma, Sobin Thomas

Add a test that oversubscribes VRAM in a multi-process environment where
each process creates a VM, binds large BOs and submits a workload at
nearly the same time.

Previous coverage lacked a scenario combining multi-process bind
with VRAM oversubscription. This generates memory pressure with
multi-process VM Bind activity and concurrent submission, exercising
the bind pipeline under eviction pressure.

v2: Removed helper APIs usage clock_nanosleep and commented
code.(Nishit)

v3: Refactored code into smaller functions.
    Added a check for available system RAM and capped the number of
    processes at 20.

v4: Remove explicit macros definition
    Replace Bind ioctl with library calls.(Thomas)
v5: Remove unused query_mem_info
    Fix xe_exec_with_retry (Thomas)
    Rename align_to_page_size with ALIGN macro (kamil/Thomas)
v6: Fix vm_bind_bo_batch: move igt_assert(ufence) before first dereference
    Fix create_test_bos: check errno instead of ret for ENOMEM/ENOSPC
    detection, since igt_ioctl returns -1 on failure. (Thomas)

Signed-off-by: Sobin Thomas <sobin.thomas@intel.com>
---
 tests/intel/xe_vm.c | 401 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 401 insertions(+)

diff --git a/tests/intel/xe_vm.c b/tests/intel/xe_vm.c
index 408bfdb71..9fa551e48 100644
--- a/tests/intel/xe_vm.c
+++ b/tests/intel/xe_vm.c
@@ -21,6 +21,7 @@
 #include "xe/xe_spin.h"
 #include <string.h>
 #define USER_FENCE_VALUE 0xdeadbeefdeadbeefull
+#define GB(x) (1024ULL * 1024ULL * 1024ULL * (x))
 
 enum overcommit_stage {
 	EXPECT_NONE,
@@ -29,6 +30,69 @@ enum overcommit_stage {
 	EXPECT_EXEC,
 };
 
+struct gem_bo { /* one buffer object: handle, size, CPU mapping, GPU address */
+	uint32_t handle;	/* GEM handle; released with gem_close() */
+	uint64_t size;		/* bytes; used as bind range and munmap() length */
+	int *ptr;		/* CPU mapping from xe_bo_map(), NULL while unmapped */
+	uint64_t addr;		/* GPU virtual address passed to the bind calls */
+};
+
+struct xe_test_ctx { /* a VM and the exec queue created on it */
+	uint32_t vm_id;		/* id returned by xe_vm_create() */
+	uint32_t exec_queue_id;	/* id returned by xe_exec_queue_create() */
+};
+
+struct mem_bind_sync { /* a set of BOs bound together, plus the bind's fence */
+	struct gem_bo *bufs;	/* array of BOs in the set */
+	int n_bufs;		/* valid entries in bufs; lowered if allocation fails */
+	uint64_t *binds_ufence;	/* user fence returned by vm_bind_bo_batch() */
+};
+
+static void create_exec_queue(int fd, struct xe_test_ctx *ctx)
+{
+	struct drm_xe_engine_class_instance first = { 0 };
+	struct drm_xe_engine_class_instance *it;
+
+	/* Take the first engine reported; 'first' stays zeroed if none is */
+	xe_for_each_engine(fd, it) {
+		first = *it;
+		break;
+	}
+	ctx->exec_queue_id = xe_exec_queue_create(fd, ctx->vm_id, &first, 0);
+}
+
+static uint64_t *
+vm_bind_bo_batch(int fd, struct xe_test_ctx *ctx, struct gem_bo *bos, int size)
+{
+	struct drm_xe_vm_bind_op binds[size];
+	struct drm_xe_sync bind_sync = {
+		.type = DRM_XE_SYNC_TYPE_USER_FENCE,
+		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
+		.timeline_value = 1,
+	};
+	uint64_t *ufence;
+
+	/* calloc() hands back a zeroed fence; it is returned to the caller */
+	ufence = calloc(1, sizeof(*ufence));
+	igt_assert(ufence);
+	bind_sync.addr = to_user_pointer(ufence);
+
+	/* One MAP op per BO: whole object, at the address stored in the BO */
+	for (int n = 0; n < size; n++) {
+		binds[n] = (struct drm_xe_vm_bind_op) {
+			.obj = bos[n].handle,
+			.obj_offset = 0,
+			.range = bos[n].size,
+			.addr = bos[n].addr,
+			.op = DRM_XE_VM_BIND_OP_MAP,
+			.flags = 0,
+		};
+	}
+
+	xe_vm_bind_array(fd, ctx->vm_id, 0, binds, size, &bind_sync, 1);
+	return ufence;
+}
+
 static uint32_t
 addr_low(uint64_t addr)
 {
@@ -3073,6 +3137,338 @@ static void test_get_property(int fd, void (*func)(int fd, uint32_t vm))
 	xe_vm_destroy(fd, vm);
 }
 
+static int build_add_batch(struct gem_bo *batch_bo, struct gem_bo *integers_bo,
+			   struct gem_bo *result_bo, int ints_to_add)
+{
+	int *cs = batch_bo->ptr;
+	int pos = 0, i = 0;
+	uint64_t gpu_addr;
+	#define GPR_RX_ADDR(x)		(0x600 + (x) * 8)
+
+	/* Sum on the GPU: r0 = integers_bo[0], every further integer goes via r1 */
+	do {
+		cs[pos++] =  MI_LOAD_REGISTER_MEM_CMD | MI_LRI_LRM_CS_MMIO | 2;
+		cs[pos++] = GPR_RX_ADDR(i ? 1 : 0);
+		gpu_addr = integers_bo->addr + i * sizeof(uint32_t);
+		cs[pos++] = gpu_addr & 0xFFFFFFFF;
+		cs[pos++] = (gpu_addr >> 32) & 0xFFFFFFFF;
+		if (i == 0)
+			continue;
+		/* r0 = r0 + r1 */
+		cs[pos++] = MI_MATH(4);
+		cs[pos++] = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(0));
+		cs[pos++] = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(1));
+		cs[pos++] = MI_MATH_ADD;
+		cs[pos++] = MI_MATH_STORE(MI_MATH_REG(0), MI_MATH_REG_ACCU);
+	} while (++i < ints_to_add);
+
+	/* result_bo[0] = r0 */
+	cs[pos++] = MI_STORE_REGISTER_MEM_GEN8 | MI_LRI_LRM_CS_MMIO;
+	cs[pos++] = GPR_RX_ADDR(0);
+	gpu_addr = result_bo->addr;
+	cs[pos++] = gpu_addr & 0xFFFFFFFF;
+	cs[pos++] = (gpu_addr >> 32) & 0xFFFFFFFF;
+
+	/* End the batch, pad to a multiple of four dwords, return dword count */
+	cs[pos++] = MI_BATCH_BUFFER_END;
+	while (pos % 4 != 0)
+		cs[pos++] = MI_NOOP;
+	return pos;
+}
+
+/* Create up to bind->n_bufs 1GB BOs, giving them consecutive addresses from *addr */
+static void create_test_bos(int fd, struct xe_test_ctx *ctx, struct mem_bind_sync *bind,
+			    uint32_t  placement, uint64_t *addr)
+{
+	const char *mem_type = (placement & vram_memory(fd, 0)) ? "VRAM" : "SRAM";
+	int ret;
+
+	for (int i = 0; i < bind->n_bufs; i++) {
+		struct gem_bo *bo = &bind->bufs[i];
+
+		bo->size = GB(1);
+		ret = __xe_bo_create_caching(fd, ctx->vm_id, bo->size, placement, 0,
+					     DRM_XE_GEM_CPU_CACHING_WC, &bo->handle);
+		/* Out of memory just trims the set to i BOs; other errors are fatal */
+		if (ret && (errno == ENOMEM || errno == ENOSPC)) {
+			bind->n_bufs = i;
+			igt_debug("%s allocation failed at buffer %d\n", mem_type, i);
+			break;
+		}
+		igt_assert_eq(ret, 0);
+		bo->ptr = NULL;
+		bo->addr = *addr;
+		*addr += bo->size;
+		igt_debug("%s buffer %d created at 0x%016" PRIx64 "\n", mem_type, i, bo->addr);
+	}
+}
+
+static int fill_random_integers(struct gem_bo *int_bo, int ints_to_add)
+{
+	uint32_t sum = 0;
+
+	/* Store rand() % 8 values in the BO mapping, logging "a + b + ... = sum" */
+	for (int idx = 0; idx < ints_to_add; idx++) {
+		int value = rand() % 8;
+
+		sum += value;
+		int_bo->ptr[idx] = value;
+		igt_debug("%d", value);
+		if (idx + 1 == ints_to_add)
+			igt_debug(" = ");
+		else
+			igt_debug(" + ");
+	}
+	igt_debug("%d\n", sum);
+	return sum;
+}
+
+/*
+ * Submit @exec, retrying for as long as the kernel reports ENOMEM or ENOSPC.
+ *
+ * The test using this helper forks several processes that bind large buffers
+ * and submit batches at nearly the same time, so a submission may fail
+ * transiently while they compete for memory. Retrying with a growing delay
+ * lets the test tell such temporary exhaustion apart from real driver bugs.
+ * Any other errno stops the loop immediately.
+ *
+ * At most @max_retries submissions are attempted. Returns the result of the
+ * last ioctl (0 on success); on failure the errno of that ioctl is logged.
+ * That errno is captured right after the ioctl because the delay and the
+ * logging in between may overwrite the global one.
+ */
+static int xe_exec_with_retry(int fd, struct drm_xe_exec *exec, int max_retries)
+{
+	int rc = 0, err = 0, retries;
+
+	for (retries = 0; retries < max_retries; retries++) {
+		rc = igt_ioctl(fd, DRM_IOCTL_XE_EXEC, exec);
+		err = rc ? errno : 0;
+		if (!(rc && (err == ENOMEM || err == ENOSPC)))
+			break;
+
+		usleep(100 * retries);
+		if (retries == 0)
+			igt_warn("got %s, retrying\n", strerror(err));
+	}
+
+	if (retries == max_retries)
+		igt_warn("gave up after %d retries\n", retries);
+
+	if (rc)
+		igt_warn("errno: %d (%s)\n", err, strerror(err));
+
+	return rc;
+}
+
+static void cleanup_bo_resources(int fd, struct gem_bo *bo)
+{
+	/* Drop the CPU mapping first (if any), then the GEM handle (if any) */
+	if (bo->ptr != NULL)
+		igt_assert_eq(munmap(bo->ptr, bo->size), 0);
+	bo->ptr = NULL;
+	if (bo->handle != 0)
+		gem_close(fd, bo->handle);
+}
+
+static void cleanup_sram_vram_objs(int fd, struct mem_bind_sync *vram_bind,
+				   struct mem_bind_sync *sram_bind)
+{
+	struct mem_bind_sync *sets[] = { vram_bind, sram_bind };
+
+	/* Per set: close each BO handle, then free the array and the bind fence */
+	for (int s = 0; s < 2; s++) {
+		for (int i = 0; i < sets[s]->n_bufs; i++)
+			gem_close(fd, sets[s]->bufs[i].handle);
+		free(sets[s]->bufs);
+		if (sets[s]->n_bufs)
+			free(sets[s]->binds_ufence);
+	}
+}
+
+/**
+ * SUBTEST: oversubscribe-concurrent-bind
+ * Description: Oversubscribe VRAM from multiple processes that bind large
+ * BOs and submit work at the same time, and ensure they all complete.
+ * Functionality: This check is for a specific bug where, if multiple
+ * processes oversubscribe VRAM, some of the binds may fail with ENOMEM
+ * due to a deadlock in the bind code.
+ * Test category: stress test
+ */
+static void test_vm_oversubscribe_concurrent_bind(int fd)
+{
+	#define MIN_BUFS_PER_PROC 2
+	#define MAX_THREADS 20
+	int n_proc = 0, n_vram_bufs = 0, n_sram_bufs = 0;
+	uint32_t max_by_mem;
+	uint64_t total_vram_demand = 0;
+	uint64_t vram_size = xe_visible_available_vram_size(fd, 0);
+	uint64_t sram_avail = (uint64_t)igt_get_avail_ram_mb() << 20;
+	uint64_t target_vram = vram_size * 2;      /* 2x VRAM */
+	uint64_t target_sram = sram_avail * 50 / 100;  /* 50% system RAM */
+
+	int total_vram_bufs = target_vram / GB(1);
+	int total_sram_bufs = target_sram / GB(1);
+	pthread_barrier_t *barrier;
+	pthread_barrierattr_t attr;
+
+	/*
+	 * n_proc: room for MIN_BUFS_PER_PROC BOs of each kind, max MAX_THREADS
+	 */
+	max_by_mem = min(total_vram_bufs / MIN_BUFS_PER_PROC,
+			 total_sram_bufs / MIN_BUFS_PER_PROC);
+	n_proc = min_t(uint32_t, max_by_mem, MAX_THREADS);
+	igt_require_f(n_proc > 0, "Not enough VRAM/RAM for oversubscription test\n");
+
+	n_vram_bufs = max(2, total_vram_bufs / n_proc);
+	n_sram_bufs = max(2, total_sram_bufs / n_proc);
+	total_vram_demand = (uint64_t)n_proc * n_vram_bufs * GB(1);
+
+	igt_debug("VRAM size: %" PRIu64 "MB, System RAM available: %" PRIu64 "MB\n",
+		  vram_size >> 20, sram_avail >> 20);
+
+	igt_debug(" n_proc = %d\n", n_proc);
+	igt_debug("VRAM: %" PRIu64 "GB\n", vram_size >> 30);
+	igt_debug("VRAM demand: %" PRIu64 "MB (%.2fx oversubscription)\n",
+		  total_vram_demand >> 20, (double)total_vram_demand / vram_size);
+	igt_debug("Processes=%d VRAM_bufs=%d SRAM_bufs=%d\n", n_proc,
+		  n_vram_bufs, n_sram_bufs);
+
+	barrier = mmap(NULL, sizeof(pthread_barrier_t),
+		       PROT_READ | PROT_WRITE,
+		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	igt_assert(barrier != MAP_FAILED);
+	igt_assert_eq(pthread_barrierattr_init(&attr), 0);
+	igt_assert_eq(pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED), 0);
+	igt_assert_eq(pthread_barrier_init(barrier, &attr, n_proc), 0);
+
+	igt_fork(child, n_proc) {
+		struct xe_test_ctx ctx = {0};
+		int rc;
+		uint64_t addr = 0x40000000;
+		int expected_result = 0, ints_to_add = 4;
+		int max_retries = 1024;
+		struct gem_bo integers_bo, result_bo, batch_bo, *vram_bufs, *sram_bufs;
+		int pos = 0;
+		struct mem_bind_sync vram_bind = {0};
+		struct mem_bind_sync sram_bind = {0};
+		struct drm_xe_sync batch_syncs[1];
+		struct drm_xe_exec exec;
+		struct gem_bo ufence_bo = {0};
+
+		vram_bufs = calloc(n_vram_bufs, sizeof(*vram_bufs));
+		sram_bufs = calloc(n_sram_bufs, sizeof(*sram_bufs));
+		srand(child);
+
+		igt_assert(vram_bufs && sram_bufs);
+
+		ctx.vm_id = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE, 0);
+		create_exec_queue(fd, &ctx);
+		vram_bind.bufs = vram_bufs;
+		vram_bind.n_bufs = n_vram_bufs;
+		sram_bind.bufs = sram_bufs;
+		sram_bind.n_bufs = n_sram_bufs;
+
+		create_test_bos(fd, &ctx, &vram_bind, vram_memory(fd, 0), &addr);
+		create_test_bos(fd, &ctx, &sram_bind, system_memory(fd), &addr);
+		/* Line up all processes so the binds below start together */
+		pthread_barrier_wait(barrier);
+
+		if (vram_bind.n_bufs)
+			vram_bind.binds_ufence = vm_bind_bo_batch(fd, &ctx, vram_bufs,
+								  vram_bind.n_bufs);
+
+		if (sram_bind.n_bufs)
+			sram_bind.binds_ufence = vm_bind_bo_batch(fd, &ctx, sram_bufs,
+								  sram_bind.n_bufs);
+
+		integers_bo.size = ALIGN(sizeof(int) * ints_to_add, 4096);
+		integers_bo.handle = xe_bo_create_caching(fd, ctx.vm_id, integers_bo.size,
+							  system_memory(fd), 0,
+							  DRM_XE_GEM_CPU_CACHING_WC);
+		integers_bo.ptr = (int *)xe_bo_map(fd, integers_bo.handle, integers_bo.size);
+		integers_bo.addr = 0x100000;
+
+		expected_result = fill_random_integers(&integers_bo, ints_to_add);
+		igt_debug("%d\n", expected_result);
+
+		result_bo.size = ALIGN(sizeof(int), 4096);
+		result_bo.handle  = xe_bo_create_caching(fd, ctx.vm_id, result_bo.size,
+							 system_memory(fd), 0,
+							 DRM_XE_GEM_CPU_CACHING_WC);
+		result_bo.ptr = NULL;
+		result_bo.addr = 0x200000;
+
+		batch_bo.size = 4096;
+		batch_bo.handle = xe_bo_create_caching(fd, ctx.vm_id, batch_bo.size,
+						       system_memory(fd), 0,
+						       DRM_XE_GEM_CPU_CACHING_WC);
+
+		batch_bo.ptr = (int *)xe_bo_map(fd, batch_bo.handle, batch_bo.size);
+		batch_bo.addr = 0x300000;
+
+		pos = build_add_batch(&batch_bo, &integers_bo, &result_bo, ints_to_add);
+
+		igt_assert(pos * sizeof(int) <= batch_bo.size);
+
+		/* Wait for large bind operations to complete before binding small BOs */
+		if (vram_bind.n_bufs)
+			xe_wait_ufence(fd, vram_bind.binds_ufence, 1, 0, INT64_MAX);
+		if (sram_bind.n_bufs)
+			xe_wait_ufence(fd, sram_bind.binds_ufence, 1, 0, INT64_MAX);
+
+		xe_vm_bind_lr_sync(fd, ctx.vm_id, integers_bo.handle, 0, integers_bo.addr,
+				   integers_bo.size, 0);
+		xe_vm_bind_lr_sync(fd, ctx.vm_id, result_bo.handle, 0, result_bo.addr,
+				   result_bo.size, 0);
+		xe_vm_bind_lr_sync(fd, ctx.vm_id, batch_bo.handle, 0, batch_bo.addr,
+				   batch_bo.size, 0);
+
+		ufence_bo.size = 4096;
+		ufence_bo.handle = xe_bo_create_caching(fd, ctx.vm_id, ufence_bo.size,
+							system_memory(fd), 0,
+							DRM_XE_GEM_CPU_CACHING_WB);
+		ufence_bo.ptr = (int *)xe_bo_map(fd, ufence_bo.handle, ufence_bo.size);
+		ufence_bo.addr = 0x400000;
+		memset(ufence_bo.ptr, 0, ufence_bo.size);
+		xe_vm_bind_lr_sync(fd, ctx.vm_id, ufence_bo.handle, 0, ufence_bo.addr,
+				   ufence_bo.size, 0);
+
+		batch_syncs[0] = (struct drm_xe_sync){
+			.type = DRM_XE_SYNC_TYPE_USER_FENCE,
+			.flags = DRM_XE_SYNC_FLAG_SIGNAL,
+			.addr = ufence_bo.addr,
+			.timeline_value = 1,
+		};
+
+		exec = (struct drm_xe_exec) {
+			.exec_queue_id = ctx.exec_queue_id,
+			.num_syncs = 1,
+			.syncs = (uintptr_t)batch_syncs,
+			.address = batch_bo.addr,
+			.num_batch_buffer = 1,
+		};
+
+		rc = xe_exec_with_retry(fd, &exec, max_retries);
+		igt_assert_eq(rc, 0);
+		xe_wait_ufence(fd, (uint64_t *)ufence_bo.ptr, 1, ctx.exec_queue_id, INT64_MAX);
+		result_bo.ptr = (int *)xe_bo_map(fd, result_bo.handle, result_bo.size);
+		igt_assert_eq(result_bo.ptr[0], expected_result);
+		cleanup_bo_resources(fd, &ufence_bo);
+		cleanup_bo_resources(fd, &result_bo);
+		cleanup_bo_resources(fd, &batch_bo);
+		cleanup_bo_resources(fd, &integers_bo);
+		cleanup_sram_vram_objs(fd, &vram_bind, &sram_bind);
+		xe_exec_queue_destroy(fd, ctx.exec_queue_id);
+		xe_vm_destroy(fd, ctx.vm_id);
+		close(fd);
+	}
+	igt_waitchildren();
+	pthread_barrier_destroy(barrier);
+	pthread_barrierattr_destroy(&attr);
+	igt_assert_eq(munmap(barrier, sizeof(pthread_barrier_t)), 0);
+}
+
 int igt_main()
 {
 	struct drm_xe_engine_class_instance *hwe, *hwe_non_copy = NULL;
@@ -3486,6 +3882,11 @@ int igt_main()
 		igt_assert(xe_visible_vram_size(fd, 0));
 		test_oom(fd);
 	}
+	igt_subtest("oversubscribe-concurrent-bind")
+	{
+		igt_require(xe_has_vram(fd));
+		test_vm_oversubscribe_concurrent_bind(fd);
+	}
 
 	for (const struct vm_get_property *f = xe_vm_get_property_tests; f->name; f++) {
 		igt_subtest_f("vm-get-property-%s", f->name)
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH i-g-t v6] test/intel/xe_vm:Add oversubscribe concurrent bind stress subtest
  2026-05-12  2:47 Sobin Thomas
@ 2026-05-21 15:52 ` Sharma, Nishit
  0 siblings, 0 replies; 3+ messages in thread
From: Sharma, Nishit @ 2026-05-21 15:52 UTC (permalink / raw)
  To: Sobin Thomas, igt-dev, thomas.hellstrom


On 5/12/2026 8:17 AM, Sobin Thomas wrote:
> Add test for oversubscribing VRAM in multi process environment that
> creates VM, bind large BOs and submit workloads nearly simultaneously.
>
> Previous coverage lacked a scenario combining multi-process bind
> with VRAM oversubscription. This generates memory pressure with
> multi-process VM Bind activity and concurrent submission, exercising
> the bind pipeline under eviction pressure.
>
> v2: Removed helper APIs usage clock_nanosleep and commented
> code.(Nishit)
>
> v3: Refactored code to smaller functions.
>      Added check for available SRAM usage and keep the max process to 20.
>
> v4: Remove explicit macros definition
>      Replace Bind ioctl with library calls.(Thomas)
> v5: Remove unused query_mem_info
>      Fix xe_exec_with_retry (Thomas)
>      Rename align_to_page_size with ALIGN macro (kamil/Thomas)
> v6: Fix vm_bind_bo_batch: move igt_assert(ufence) before first dereference
>      Fix create_test_bos: check errno instead of ret for ENOMEM/ENOSPC
>      detection, since igt_ioctl returns -1 on failure. (Thomas)
>
> Signed-off-by: Sobin Thomas <sobin.thomas@intel.com>
> ---
>   tests/intel/xe_vm.c | 401 ++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 401 insertions(+)
>
> diff --git a/tests/intel/xe_vm.c b/tests/intel/xe_vm.c
> index 408bfdb71..9fa551e48 100644
> --- a/tests/intel/xe_vm.c
> +++ b/tests/intel/xe_vm.c
> @@ -21,6 +21,7 @@
>   #include "xe/xe_spin.h"
>   #include <string.h>
>   #define USER_FENCE_VALUE 0xdeadbeefdeadbeefull
> +#define GB(x) (1024ULL * 1024ULL * 1024ULL * (x))
>   
>   enum overcommit_stage {
>   	EXPECT_NONE,
> @@ -29,6 +30,69 @@ enum overcommit_stage {
>   	EXPECT_EXEC,
>   };
>   
> +struct gem_bo {
> +	uint32_t handle;
> +	uint64_t size;
> +	int *ptr;
> +	uint64_t addr;
> +};
> +
> +struct xe_test_ctx {
> +	uint32_t vm_id;
> +	uint32_t exec_queue_id;
> +};
> +
> +struct mem_bind_sync {
> +	struct gem_bo *bufs;
> +	int n_bufs;
> +	uint64_t *binds_ufence;
> +};
> +
> +static void create_exec_queue(int fd, struct xe_test_ctx *ctx)
> +{
> +	struct drm_xe_engine_class_instance *hwe;
> +	struct drm_xe_engine_class_instance eci = { 0 };
> +
> +	/* Use first available engine */
> +	xe_for_each_engine(fd, hwe) {
> +		eci = *hwe;
> +		break;
> +	}
> +	ctx->exec_queue_id = xe_exec_queue_create(fd, ctx->vm_id, &eci, 0);
> +}
> +
> +static uint64_t *
> +vm_bind_bo_batch(int fd, struct xe_test_ctx *ctx, struct gem_bo *bos, int size)
> +{
> +	uint64_t *ufence;
> +	struct drm_xe_sync bind_sync;
> +	struct drm_xe_vm_bind_op binds[size];
> +	int i;
> +
> +	ufence = calloc(1, sizeof(uint64_t));
> +	igt_assert(ufence);
> +	*ufence = 0;
> +	bind_sync = (struct drm_xe_sync) {
> +		.type = DRM_XE_SYNC_TYPE_USER_FENCE,
> +		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +		.addr = to_user_pointer(ufence),
> +		.timeline_value = 1,
> +	};
> +
> +	for (i = 0; i < size; i++) {
> +		binds[i] = (struct drm_xe_vm_bind_op) {
> +			.obj = bos[i].handle,
> +		.obj_offset = 0,
> +			.range = bos[i].size,
> +			.addr = bos[i].addr,
> +			.op = DRM_XE_VM_BIND_OP_MAP,
> +			.flags = 0,
> +		};
> +	}
> +	xe_vm_bind_array(fd, ctx->vm_id, 0, binds, size, &bind_sync, 1);
> +	return ufence;
> +}
> +
>   static uint32_t
>   addr_low(uint64_t addr)
>   {
> @@ -3073,6 +3137,338 @@ static void test_get_property(int fd, void (*func)(int fd, uint32_t vm))
>   	xe_vm_destroy(fd, vm);
>   }
>   
> +static int build_add_batch(struct gem_bo *batch_bo, struct gem_bo *integers_bo,
> +			   struct gem_bo *result_bo, int ints_to_add)
> +{
> +	int pos = 0;
> +	uint64_t tmp_addr;
> +	#define GPR_RX_ADDR(x)		(0x600 + (x) * 8)
> +
> +	batch_bo->ptr[pos++] =  MI_LOAD_REGISTER_MEM_CMD | MI_LRI_LRM_CS_MMIO | 2;
> +	batch_bo->ptr[pos++] = GPR_RX_ADDR(0);
> +	tmp_addr = integers_bo->addr + 0 * sizeof(uint32_t);
> +	batch_bo->ptr[pos++] = tmp_addr & 0xFFFFFFFF;
> +	batch_bo->ptr[pos++] = (tmp_addr >> 32) & 0xFFFFFFFF;
> +	for (int i = 1; i < ints_to_add; i++) {
> +		/* r1 = integers_bo[i] */
> +		batch_bo->ptr[pos++] =  MI_LOAD_REGISTER_MEM_CMD | MI_LRI_LRM_CS_MMIO | 2;
> +		batch_bo->ptr[pos++] = GPR_RX_ADDR(1);
> +		tmp_addr = integers_bo->addr + i * sizeof(uint32_t);
> +		batch_bo->ptr[pos++] = tmp_addr & 0xFFFFFFFF;
> +		batch_bo->ptr[pos++] = (tmp_addr >> 32) & 0xFFFFFFFF;
> +		/* r0 = r0 + r1 */
> +		batch_bo->ptr[pos++] = MI_MATH(4);
> +		batch_bo->ptr[pos++] = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(0));
> +		batch_bo->ptr[pos++] = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(1));
> +		batch_bo->ptr[pos++] = MI_MATH_ADD;
> +		batch_bo->ptr[pos++] = MI_MATH_STORE(MI_MATH_REG(0), MI_MATH_REG_ACCU);
> +	}
> +	/* result_bo[0] = r0 */
> +	batch_bo->ptr[pos++] = MI_STORE_REGISTER_MEM_GEN8 | MI_LRI_LRM_CS_MMIO;
> +	batch_bo->ptr[pos++] = GPR_RX_ADDR(0);
> +	tmp_addr = result_bo->addr + 0 * sizeof(uint32_t);
> +	batch_bo->ptr[pos++] = tmp_addr & 0xFFFFFFFF;
> +	batch_bo->ptr[pos++] = (tmp_addr >> 32) & 0xFFFFFFFF;
> +
> +	batch_bo->ptr[pos++] = MI_BATCH_BUFFER_END;
> +	while (pos % 4 != 0)
> +		batch_bo->ptr[pos++] = MI_NOOP;
> +	return pos;
> +}
> +
> +static void create_test_bos(int fd, struct xe_test_ctx *ctx, struct mem_bind_sync *bind,
> +			    uint32_t  placement, uint64_t *addr)
> +{
> +	const char *mem_type = (placement & vram_memory(fd, 0)) ? "VRAM" : "SRAM";
> +	uint32_t ret;
> +
> +	for (int i = 0; i < bind->n_bufs; i++) {
> +		struct gem_bo *bo = &bind->bufs[i];
> +
> +		bo->size = GB(1);
> +		ret = __xe_bo_create_caching(fd, ctx->vm_id, bo->size, placement, 0,
> +					     DRM_XE_GEM_CPU_CACHING_WC, &bo->handle);
> +		if (ret) {
> +			if (errno == ENOMEM || errno == ENOSPC) {

__xe_bo_create_caching() calls __xe_bo_create(), which calls igt_ioctl()
and returns err, which is -1 on failure or 0 on success.

However, it does not set errno, which is what your code checks, so this
condition will never be met. Either add err = -errno in the library,

or use a BO-creation function that sets or returns errno.

> +				bind->n_bufs = i;
> +				igt_debug("%s allocation failed at buffer %d\n", mem_type, i);
> +				break;
> +			}

If __xe_bo_create() fails in any iteration with an error other than
ENOMEM/ENOSPC, we need to handle that as well. For example, the first
three xe_bo_create() calls pass, the 4th iteration fails with an error
other than ENOMEM/ENOSPC,

but the 5th iteration gets ENOMEM/ENOSPC.

> +			igt_assert_eq(ret, 0);
> +		}
> +		bo->ptr = NULL;
> +		bo->addr = *addr;
> +		*addr += bo->size;
> +		igt_debug("%s buffer %d created at 0x%016lx\n", mem_type, i, bo->addr);
> +	}
> +}
> +
> +static int fill_random_integers(struct gem_bo *int_bo, int ints_to_add)
> +{
> +	uint32_t expected_result = 0;
> +
> +	for (int i = 0; i < ints_to_add; i++) {
> +		int random_int = rand() % 8;
> +
> +		int_bo->ptr[i] = random_int;
> +		expected_result += random_int;
> +
> +		igt_debug("%d", random_int);
> +		if (i + 1 != ints_to_add)
> +			igt_debug(" + ");
> +		else
> +			igt_debug(" = ");
> +	}
> +	igt_debug("%d\n", expected_result);
> +	return expected_result;
> +}
> +
> +/*
> + * In concurrent VM bind stress tests, multiple threads simultaneously bind
> + * buffers to GPU virtual address space and submit batch operations. This
> + * creates significant GPU memory pressure where the kernel may transiently
> + * fail batch submission when:
> + *   - GPU page tables are being updated across multiple bindings
> + *   - GPU memory is fragmented across many concurrent buffer mappings
> + *   - Multiple processes compete for finite GPU resources
> + *
> + * Without retries, transient ENOMEM/ENOSPC failures cause false test failures.
> + * Retrying lets us distinguish temporary resource exhaustion from actual
> + * driver bugs. Non ENOMEM/ENOSPC errors still fail immediately and are properly
> + * reported with full errno context for debugging.
> + */
> +static int xe_exec_with_retry(int fd, struct drm_xe_exec *exec, int max_retries)
> +{
> +	int rc = 0, retries;
> +
> +	for (retries = 1; retries < max_retries; retries++) {
> +		rc = igt_ioctl(fd, DRM_IOCTL_XE_EXEC, exec);
> +
> +		if (!(rc && (errno == ENOMEM || errno == ENOSPC)))
> +			break;
> +
> +		usleep(100 * retries);
> +		if (retries == 0)
retries starts at 1, so the retries == 0 condition is never met; this is
dead code.
> +			igt_warn("got %s, retrying\n", strerror(errno));
> +	}
> +
> +	if (retries == max_retries)
> +		igt_warn("gave up after %d retries\n", retries);
> +
> +	if (rc)
> +		igt_warn("errno: %d (%s)\n", errno, strerror(errno));
> +
> +	return rc;
> +}
> +
> +static void cleanup_bo_resources(int fd, struct gem_bo *bo)
> +{
> +	if (bo->ptr) {
> +		igt_assert_eq(munmap(bo->ptr, bo->size), 0);
> +		bo->ptr = NULL;
> +	}
> +	if (bo->handle)
> +		gem_close(fd, bo->handle);
> +}
> +
> +static void cleanup_sram_vram_objs(int fd, struct mem_bind_sync *vram_bind,
> +				   struct mem_bind_sync *sram_bind)
> +{
> +	for (int i = 0; i < vram_bind->n_bufs; i++)
> +		gem_close(fd, vram_bind->bufs[i].handle);
> +	for (int i = 0; i < sram_bind->n_bufs; i++)
> +		gem_close(fd, sram_bind->bufs[i].handle);
> +	free(vram_bind->bufs);
> +	free(sram_bind->bufs);
> +	if (vram_bind->n_bufs)
> +		free(vram_bind->binds_ufence);
> +	if (sram_bind->n_bufs)
> +		free(sram_bind->binds_ufence);
> +}
> +
> +/**
> + * SUBTEST: oversubscribe-concurrent-bind
> + * Description: Test for oversubscribing the VM with multiple processes
> + * doing binds at the same time, and ensure they all complete successfully.
> + * Functionality: This check is for a specific bug where if multiple processes
> + * oversubscribe the VM, some of the binds may fail with  ENOMEM due to
> + * deadlock in the bind code.
> + * Test category: stress test
> + */
> +static void test_vm_oversubscribe_concurrent_bind(int fd)
> +{
> +	#define MIN_BUFS_PER_PROC 2
> +	#define MAX_THREADS 20
> +	int n_proc = 0, n_vram_bufs = 0, n_sram_bufs = 0;
> +	uint32_t max_by_mem;
> +	uint64_t total_vram_demand = 0;
> +	uint64_t vram_size = xe_visible_available_vram_size(fd, 0);
> +	uint64_t sram_avail = (uint64_t)igt_get_avail_ram_mb() << 20;
> +	uint64_t target_vram = vram_size * 2;      /* 2 of VRAM */
> +	uint64_t target_sram = sram_avail * 50 / 100;  /* 50% system RAM */
> +
> +	int total_vram_bufs = target_vram / GB(1);
> +	int total_sram_bufs = target_sram / GB(1);
> +
> +	/* determine concurrency from memory pressure */
> +
> +	pthread_barrier_t *barrier;
> +	pthread_barrierattr_t attr;
> +
> +	max_by_mem = min(total_vram_bufs / MIN_BUFS_PER_PROC,
> +			 total_sram_bufs / MIN_BUFS_PER_PROC);
> +	n_proc = min_t(uint32_t, max_by_mem, MAX_THREADS);
> +	igt_require_f(n_proc > 0, "Not enough VRAM/RAM for oversubscription test\n");
> +
> +	n_vram_bufs = max(2, total_vram_bufs / n_proc);
> +	n_sram_bufs = max(2, total_sram_bufs / n_proc);
> +	total_vram_demand = (uint64_t)n_proc * n_vram_bufs * GB(1);
> +
> +	igt_debug("VRAM size: %" PRIu64 "MB, System RAM available: %" PRIu64 "MB\n",
> +		  vram_size >> 20, sram_avail >> 20);
> +
> +	igt_debug(" n_proc = %d\n", n_proc);
> +	igt_debug("VRAM: %" PRIu64 "GB\n", vram_size >> 30);
> +	igt_debug("VRAM demand: %" PRIu64 "MB (%.2fx oversubscription)\n",
> +		  total_vram_demand >> 20, (double)total_vram_demand / vram_size);
> +	igt_debug("Processes=%d VRAM_bufs=%d SRAM_bufs=%d\n", n_proc,
> +		  n_vram_bufs, n_sram_bufs);
> +
> +	barrier = mmap(NULL, sizeof(pthread_barrier_t),
> +		       PROT_READ | PROT_WRITE,
> +		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> +	igt_assert(barrier != MAP_FAILED);
> +	pthread_barrierattr_init(&attr);
> +	pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
> +	pthread_barrier_init(barrier, &attr, n_proc);
> +
> +	igt_fork(child, n_proc) {
> +		struct xe_test_ctx ctx = {0};
> +		int rc;
> +		uint64_t addr = 0x40000000;
> +		int expected_result = 0, ints_to_add = 4;
> +		int max_retries = 1024;
> +		struct gem_bo integers_bo, result_bo, batch_bo, *vram_bufs, *sram_bufs;
> +		int pos = 0;
> +		struct mem_bind_sync vram_bind = {0};
> +		struct mem_bind_sync sram_bind = {0};
> +		struct drm_xe_sync batch_syncs[1];
> +		struct drm_xe_exec exec;
> +		struct gem_bo ufence_bo = {0};
> +
> +		vram_bufs = (struct gem_bo *)calloc(n_vram_bufs, sizeof(struct gem_bo));
> +		sram_bufs = (struct gem_bo *)calloc(n_sram_bufs, sizeof(struct gem_bo));
> +		srand(child);
> +
> +		igt_assert(vram_bufs && sram_bufs);
> +
> +		ctx.vm_id = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE, 0);
> +		create_exec_queue(fd, &ctx);
> +		vram_bind.bufs = vram_bufs;
> +		vram_bind.n_bufs = n_vram_bufs;
> +		sram_bind.bufs = sram_bufs;
> +		sram_bind.n_bufs = n_sram_bufs;
> +
> +		create_test_bos(fd, &ctx, &vram_bind, vram_memory(fd, 0), &addr);
> +		create_test_bos(fd, &ctx, &sram_bind, system_memory(fd), &addr);
> +
> +		pthread_barrier_wait(barrier);
> +
> +		if (vram_bind.n_bufs)
> +			vram_bind.binds_ufence = vm_bind_bo_batch(fd, &ctx, vram_bufs,
> +								  vram_bind.n_bufs);
> +
> +		if (sram_bind.n_bufs)
> +			sram_bind.binds_ufence = vm_bind_bo_batch(fd, &ctx, sram_bufs,
> +								  sram_bind.n_bufs);

If both vram_bind.n_bufs and sram_bind.n_bufs are 0, the code below is
still executed, so a test that is meant to check oversubscription

will silently pass. The test should be skipped if both n_bufs values above are 0:

if (!vram_bind.n_bufs && !sram_bind.n_bufs)
     igt_skip("No BOs allocated; VRAM/SRAM unavailable, skipping\n");

> +
> +		integers_bo.size = ALIGN(sizeof(int) * ints_to_add, 4096);
> +		integers_bo.handle = xe_bo_create_caching(fd, ctx.vm_id, integers_bo.size,
> +							  system_memory(fd), 0,
> +							  DRM_XE_GEM_CPU_CACHING_WC);
> +		integers_bo.ptr = (int *)xe_bo_map(fd, integers_bo.handle, integers_bo.size);
> +		integers_bo.addr = 0x100000;
> +
> +		expected_result = fill_random_integers(&integers_bo, ints_to_add);
> +		igt_debug("%d\n", expected_result);
> +
> +		result_bo.size = ALIGN(sizeof(int), 4096);
> +		result_bo.handle  = xe_bo_create_caching(fd, ctx.vm_id, result_bo.size,
> +							 system_memory(fd), 0,
> +							 DRM_XE_GEM_CPU_CACHING_WC);
> +		result_bo.ptr = NULL;
> +		result_bo.addr = 0x200000;
> +
> +		batch_bo.size = 4096;
> +		batch_bo.handle = xe_bo_create_caching(fd, ctx.vm_id, batch_bo.size,
> +						       system_memory(fd), 0,
> +						       DRM_XE_GEM_CPU_CACHING_WC);
> +
> +		batch_bo.ptr = (int *)xe_bo_map(fd, batch_bo.handle, batch_bo.size);
> +		batch_bo.addr = 0x300000;
> +
> +		pos = build_add_batch(&batch_bo, &integers_bo, &result_bo, ints_to_add);
> +
> +		igt_assert(pos * sizeof(int) <= batch_bo.size);
> +
> +		/* Wait for large bind operations to complete before binding small BOs */
> +		if (vram_bind.n_bufs)
> +			xe_wait_ufence(fd, vram_bind.binds_ufence, 1, 0, INT64_MAX);
> +		if (sram_bind.n_bufs)
> +			xe_wait_ufence(fd, sram_bind.binds_ufence, 1, 0, INT64_MAX);
> +
> +		xe_vm_bind_lr_sync(fd, ctx.vm_id, integers_bo.handle, 0, integers_bo.addr,
> +				   integers_bo.size, 0);
> +		xe_vm_bind_lr_sync(fd, ctx.vm_id, result_bo.handle, 0, result_bo.addr,
> +				   result_bo.size, 0);
> +		xe_vm_bind_lr_sync(fd, ctx.vm_id, batch_bo.handle, 0, batch_bo.addr,
> +				   batch_bo.size, 0);
> +
> +		ufence_bo.size = 4096;
> +		ufence_bo.handle = xe_bo_create_caching(fd, ctx.vm_id, ufence_bo.size,
> +							system_memory(fd), 0,
> +							DRM_XE_GEM_CPU_CACHING_WB);
> +		ufence_bo.ptr = (int *)xe_bo_map(fd, ufence_bo.handle, ufence_bo.size);
> +		ufence_bo.addr = 0x400000;
> +		memset(ufence_bo.ptr, 0, ufence_bo.size);
> +		xe_vm_bind_lr_sync(fd, ctx.vm_id, ufence_bo.handle, 0, ufence_bo.addr,
> +				   ufence_bo.size, 0);
> +
> +		batch_syncs[0] = (struct drm_xe_sync){
> +			.type = DRM_XE_SYNC_TYPE_USER_FENCE,
> +			.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +			.addr = ufence_bo.addr,
> +			.timeline_value = 1,
> +		};
> +
> +		exec = (struct drm_xe_exec) {
> +			.exec_queue_id = ctx.exec_queue_id,
> +			.num_syncs = 1,
> +			.syncs = (uintptr_t)batch_syncs,
> +			.address = batch_bo.addr,
> +			.num_batch_buffer = 1,
> +		};
> +
> +		rc = xe_exec_with_retry(fd, &exec, max_retries);
> +		igt_assert_eq(rc, 0);
> +		xe_wait_ufence(fd, (uint64_t *)ufence_bo.ptr, 1, ctx.exec_queue_id, INT64_MAX);
> +		result_bo.ptr = (int *)xe_bo_map(fd, result_bo.handle, result_bo.size);
Missing igt_assert(map != MAP_FAILED); if the mapping fails, the access below will crash.
> +		igt_assert_eq(result_bo.ptr[0], expected_result);
> +		cleanup_bo_resources(fd, &ufence_bo);
> +		cleanup_bo_resources(fd, &result_bo);
> +		cleanup_bo_resources(fd, &batch_bo);
> +		cleanup_bo_resources(fd, &integers_bo);
> +		cleanup_sram_vram_objs(fd, &vram_bind, &sram_bind);
> +		xe_exec_queue_destroy(fd, ctx.exec_queue_id);
> +		xe_vm_destroy(fd, ctx.vm_id);
> +		close(fd);
> +	}
> +	igt_waitchildren();
> +	pthread_barrier_destroy(barrier);
> +	pthread_barrierattr_destroy(&attr);
> +	igt_assert_eq(munmap(barrier, sizeof(pthread_barrier_t)), 0);
> +}
> +
>   int igt_main()
>   {
>   	struct drm_xe_engine_class_instance *hwe, *hwe_non_copy = NULL;
> @@ -3486,6 +3882,11 @@ int igt_main()
>   		igt_assert(xe_visible_vram_size(fd, 0));
>   		test_oom(fd);
>   	}
> +	igt_subtest("oversubscribe-concurrent-bind")
> +	{
> +		igt_require(xe_has_vram(fd));
> +		test_vm_oversubscribe_concurrent_bind(fd);
> +	}
>   
>   	for (const struct vm_get_property *f = xe_vm_get_property_tests; f->name; f++) {
>   		igt_subtest_f("vm-get-property-%s", f->name)

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-05-21 15:53 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-06 14:10 [PATCH i-g-t v6] test/intel/xe_vm:Add oversubscribe concurrent bind stress subtest Sobin Thomas
  -- strict thread matches above, loose matches on Subject: below --
2026-05-12  2:47 Sobin Thomas
2026-05-21 15:52 ` Sharma, Nishit

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.