Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/xe: Thread prefetch of SVM ranges
@ 2025-06-16  6:47 Matthew Brost
  2025-06-16  8:28 ` Thomas Hellström
                   ` (4 more replies)
  0 siblings, 5 replies; 14+ messages in thread
From: Matthew Brost @ 2025-06-16  6:47 UTC (permalink / raw)
  To: intel-xe; +Cc: himal.prasad.ghimiray, thomas.hellstrom, michal.mrozek

The migrate_vma_* functions are very CPU-intensive; as a result,
prefetching SVM ranges is limited by CPU performance rather than paging
copy engine bandwidth. To accelerate SVM range prefetching, the step
that calls migrate_vma_* is now threaded. This uses a dedicated
workqueue, as the page fault workqueue cannot be shared without risking
deadlocks—due to the prefetch IOCTL holding the VM lock in write mode
while work items in the page fault workqueue also require the VM lock.

The prefetch workqueue is currently allocated in GT, similar to the page
fault workqueue. While this is likely not the ideal location for either,
refactoring will be deferred to a later patch.

Running xe_exec_system_allocator --r prefetch-benchmark, which tests
64MB prefetches, shows an increase from ~4.35 GB/s to ~12.25 GB/s with
this patch on drm-tip. Enabling high SLPC further increases throughput
to ~15.25 GB/s, and combining SLPC with ULLS raises it to ~16 GB/s. Both
of these optimizations are upcoming.

v2:
 - Use dedicated prefetch workqueue
 - Pick dedicated prefetch thread count based on profiling
 - Skip threaded prefetch when there is only 1 range or when prefetching to SRAM
 - Fully tested

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_gt_pagefault.c |  31 ++++++-
 drivers/gpu/drm/xe/xe_gt_types.h     |   2 +
 drivers/gpu/drm/xe/xe_vm.c           | 128 +++++++++++++++++++++------
 3 files changed, 135 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index e2d975b2fddb..941cca3371f2 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -400,6 +400,8 @@ static void pagefault_fini(void *arg)
 
 	destroy_workqueue(gt->usm.acc_wq);
 	destroy_workqueue(gt->usm.pf_wq);
+	if (gt->usm.prefetch_wq)
+		destroy_workqueue(gt->usm.prefetch_wq);
 }
 
 static int xe_alloc_pf_queue(struct xe_gt *gt, struct pf_queue *pf_queue)
@@ -438,10 +440,24 @@ static int xe_alloc_pf_queue(struct xe_gt *gt, struct pf_queue *pf_queue)
 	return 0;
 }
 
+static int prefetch_thread_count(struct xe_device *xe)
+{
+	if (!IS_DGFX(xe))
+		return 0;
+
+	/*
+	 * Based on profiling large aligned 2M prefetches, this is the optimal
+	 * number of threads on BMG (only platform currently supported). This
+	 * should be tuned for each supported platform and can change on a
+	 * per-platform basis as optimizations land (e.g., large device pages).
+	 */
+	return 5;
+}
+
 int xe_gt_pagefault_init(struct xe_gt *gt)
 {
 	struct xe_device *xe = gt_to_xe(gt);
-	int i, ret = 0;
+	int i, count, ret = 0;
 
 	if (!xe->info.has_usm)
 		return 0;
@@ -462,10 +478,23 @@ int xe_gt_pagefault_init(struct xe_gt *gt)
 	if (!gt->usm.pf_wq)
 		return -ENOMEM;
 
+	count = prefetch_thread_count(xe);
+	if (count) {
+		gt->usm.prefetch_wq = alloc_workqueue("xe_gt_prefetch_work_queue",
+						      WQ_UNBOUND | WQ_HIGHPRI,
+						      count);
+		if (!gt->usm.prefetch_wq) {
+			destroy_workqueue(gt->usm.pf_wq);
+			return -ENOMEM;
+		}
+	}
+
 	gt->usm.acc_wq = alloc_workqueue("xe_gt_access_counter_work_queue",
 					 WQ_UNBOUND | WQ_HIGHPRI,
 					 NUM_ACC_QUEUE);
 	if (!gt->usm.acc_wq) {
+		if (gt->usm.prefetch_wq)
+			destroy_workqueue(gt->usm.prefetch_wq);
 		destroy_workqueue(gt->usm.pf_wq);
 		return -ENOMEM;
 	}
diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
index 7def0959da35..d9ba4921b8ce 100644
--- a/drivers/gpu/drm/xe/xe_gt_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_types.h
@@ -239,6 +239,8 @@ struct xe_gt {
 		u16 reserved_bcs_instance;
 		/** @usm.pf_wq: page fault work queue, unbound, high priority */
 		struct workqueue_struct *pf_wq;
+		/** @usm.prefetch_wq: prefetch work queue, unbound, high priority */
+		struct workqueue_struct *prefetch_wq;
 		/** @usm.acc_wq: access counter work queue, unbound, high priority */
 		struct workqueue_struct *acc_wq;
 		/**
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 6ef8c4dab647..1ae8e03aead6 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -2885,52 +2885,130 @@ static int check_ufence(struct xe_vma *vma)
 	return 0;
 }
 
-static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
+struct prefetch_thread {
+	struct work_struct work;
+	struct drm_gpusvm_ctx *ctx;
+	struct xe_vma *vma;
+	struct xe_svm_range *svm_range;
+	struct xe_tile *tile;
+	u32 region;
+	int err;
+};
+
+static void prefetch_work_func(struct work_struct *w)
 {
-	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR);
-	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
+	struct prefetch_thread *thread =
+		container_of(w, struct prefetch_thread, work);
+	struct xe_vma *vma = thread->vma;
+	struct xe_vm *vm = xe_vma_vm(vma);
+	struct xe_svm_range *svm_range = thread->svm_range;
+	u32 region = thread->region;
+	struct xe_tile *tile = thread->tile;
 	int err = 0;
 
-	struct xe_svm_range *svm_range;
+	if (!region) {
+		xe_svm_range_migrate_to_smem(vm, svm_range);
+	} else if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
+		err = xe_svm_alloc_vram(vm, tile, svm_range, thread->ctx);
+		if (err) {
+			drm_dbg(&vm->xe->drm,
+				"VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
+				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
+			thread->err = -ENODATA;
+			return;
+		}
+		xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
+	}
+
+	err = xe_svm_range_get_pages(vm, svm_range, thread->ctx);
+	if (err) {
+		drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
+			vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
+		if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
+			err = -ENODATA;
+		thread->err = err;
+		return;
+	}
+
+	xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
+}
+
+static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
+{
+	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
+	u32 j, region = op->prefetch_range.region;
 	struct drm_gpusvm_ctx ctx = {};
-	struct xe_tile *tile;
+	struct prefetch_thread stack_thread;
+	struct xe_svm_range *svm_range;
+	struct xarray prefetches;
+	bool sram = region_to_mem_type[region] == XE_PL_TT;
+	struct xe_tile *tile = sram ? xe_device_get_root_tile(vm->xe) :
+		&vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
 	unsigned long i;
-	u32 region;
+	bool devmem_possible = IS_DGFX(vm->xe) &&
+		IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR);
+	bool skip_threads = op->prefetch_range.ranges_count == 1 || sram;
+	struct prefetch_thread *thread = skip_threads ? &stack_thread : NULL;
+	int err = 0;
 
 	if (!xe_vma_is_cpu_addr_mirror(vma))
 		return 0;
 
-	region = op->prefetch_range.region;
+	if (!skip_threads)
+		xa_init_flags(&prefetches, XA_FLAGS_ALLOC);
 
 	ctx.read_only = xe_vma_read_only(vma);
 	ctx.devmem_possible = devmem_possible;
 	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
 
-	/* TODO: Threading the migration */
 	xa_for_each(&op->prefetch_range.range, i, svm_range) {
-		if (!region)
-			xe_svm_range_migrate_to_smem(vm, svm_range);
+		if (!skip_threads) {
+			thread = kmalloc(sizeof(*thread), GFP_KERNEL);
+			if (!thread)
+				goto wait_threads;
 
-		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
-			tile = &vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
-			err = xe_svm_alloc_vram(vm, tile, svm_range, &ctx);
+			err = xa_alloc(&prefetches, &j, thread, xa_limit_32b,
+				       GFP_KERNEL);
 			if (err) {
-				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
-					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
-				return -ENODATA;
+				kfree(thread);
+				goto wait_threads;
 			}
-			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
 		}
 
-		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
-		if (err) {
-			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
-				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
-			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
-				err = -ENODATA;
-			return err;
+		INIT_WORK(&thread->work, prefetch_work_func);
+		thread->ctx = &ctx;
+		thread->vma = vma;
+		thread->svm_range = svm_range;
+		thread->tile = tile;
+		thread->region = region;
+		thread->err = 0;
+
+		if (skip_threads) {
+			prefetch_work_func(&thread->work);
+			if (thread->err)
+				return thread->err;
+		} else {
+			/*
+			 * Prefetch uses a dedicated workqueue, as the page
+			 * fault workqueue cannot be shared without risking
+			 * deadlocks—due to holding the VM lock in write mode
+			 * here while work items in the page fault workqueue
+			 * also require the VM lock.
+			 */
+			queue_work(tile->primary_gt->usm.prefetch_wq,
+				   &thread->work);
+		}
+	}
+
+wait_threads:
+	if (!skip_threads) {
+		xa_for_each(&prefetches, i, thread) {
+			flush_work(&thread->work);
+			if (thread->err && (!err || err == -ENODATA))
+				err = thread->err;
+			kfree(thread);
 		}
-		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
+		xa_destroy(&prefetches);
 	}
 
 	return err;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread
* [PATCH] drm/xe: Thread prefetch of SVM ranges
@ 2025-05-28 17:27 Matthew Brost
  0 siblings, 0 replies; 14+ messages in thread
From: Matthew Brost @ 2025-05-28 17:27 UTC (permalink / raw)
  To: intel-xe; +Cc: thomas.hellstrom, himal.prasad.ghimiray

The migrate_vma_* functions are very CPU-intensive; thus, prefetching of
SVM ranges is limited by the CPU rather than the paging copy engine
bandwidth. In an effort to speed up the prefetching of SVM ranges, the
step that calls migrate_vma_* is now threaded. This utilizes the
existing page fault work queue for threading.

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_vm.c | 111 +++++++++++++++++++++++++++----------
 1 file changed, 83 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 5a978da411b0..18e5a36c6c21 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -2878,53 +2878,108 @@ static int check_ufence(struct xe_vma *vma)
 	return 0;
 }
 
-static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
+struct prefetch_thread {
+	struct work_struct work;
+	struct drm_gpusvm_ctx *ctx;
+	struct xe_vma *vma;
+	struct xe_svm_range *svm_range;
+	u32 region;
+	int err;
+};
+
+static void prefetch_work_func(struct work_struct *w)
 {
-	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR);
-	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
+	struct prefetch_thread *thread =
+		container_of(w, struct prefetch_thread, work);
+	struct xe_vma *vma = thread->vma;
+	struct xe_vm *vm = xe_vma_vm(vma);
+	struct xe_svm_range *svm_range = thread->svm_range;
+	u32 region = thread->region;
+	struct xe_tile *tile =
+		&vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
 	int err = 0;
 
-	struct xe_svm_range *svm_range;
+	if (!region) {
+		xe_svm_range_migrate_to_smem(vm, svm_range);
+	} else if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
+		err = xe_svm_alloc_vram(vm, tile, svm_range, thread->ctx);
+		if (err) {
+			drm_dbg(&vm->xe->drm,
+				"VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
+				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
+			thread->err = -ENODATA;
+			return;
+		}
+		xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
+	}
+
+	err = xe_svm_range_get_pages(vm, svm_range, thread->ctx);
+	if (err) {
+		if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
+			err = -ENODATA;
+		drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
+			vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
+		thread->err = err;
+		return;
+	}
+
+	xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
+}
+
+static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
+{
+	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
+	u32 j, region = op->prefetch_range.region;
 	struct drm_gpusvm_ctx ctx = {};
-	struct xe_tile *tile;
+	struct prefetch_thread *thread;
+	struct xe_svm_range *svm_range;
+	struct xarray prefetches;
+	struct xe_tile *tile =
+		&vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
 	unsigned long i;
-	u32 region;
+	bool devmem_possible = IS_DGFX(vm->xe) &&
+		IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR);
+	int err = 0;
 
 	if (!xe_vma_is_cpu_addr_mirror(vma))
 		return 0;
 
-	region = op->prefetch_range.region;
+	xa_init_flags(&prefetches, XA_FLAGS_ALLOC);
 
 	ctx.read_only = xe_vma_read_only(vma);
 	ctx.devmem_possible = devmem_possible;
 	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
 
-	/* TODO: Threading the migration */
 	xa_for_each(&op->prefetch_range.range, i, svm_range) {
-		if (!region)
-			xe_svm_range_migrate_to_smem(vm, svm_range);
+		thread = kmalloc(sizeof(*thread), GFP_KERNEL);
+		if (!thread)
+			goto wait_threads;
 
-		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
-			tile = &vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
-			err = xe_svm_alloc_vram(vm, tile, svm_range, &ctx);
-			if (err) {
-				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
-					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
-				return -ENODATA;
-			}
-			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
-		}
-
-		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
+		err = xa_alloc(&prefetches, &j, thread, xa_limit_32b,
+			       GFP_KERNEL);
 		if (err) {
-			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
-				err = -ENODATA;
-			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
-				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
-			return err;
+			kfree(thread);
+			goto wait_threads;
 		}
-		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
+
+		INIT_WORK(&thread->work, prefetch_work_func);
+		thread->ctx = &ctx;
+		thread->vma = vma;
+		thread->svm_range = svm_range;
+		thread->region = region;
+		thread->err = 0;
+
+		queue_work(tile->primary_gt->usm.pf_wq, &thread->work);
+	}
+
+wait_threads:
+	xa_for_each(&prefetches, i, thread) {
+		flush_work(&thread->work);
+		if (thread->err && (!err || err == -ENODATA))
+			err = thread->err;
+		kfree(thread);
 	}
+	xa_destroy(&prefetches);
 
 	return err;
 }
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2025-06-17 17:09 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-06-16  6:47 [PATCH] drm/xe: Thread prefetch of SVM ranges Matthew Brost
2025-06-16  8:28 ` Thomas Hellström
2025-06-16  8:58   ` Matthew Brost
2025-06-16  9:24     ` Thomas Hellström
2025-06-16 12:06       ` Mrozek, Michal
2025-06-17 14:30         ` Matthew Brost
2025-06-16 11:20     ` Thomas Hellström
2025-06-17 17:10       ` Matthew Brost
2025-06-16 11:51 ` ✓ CI.KUnit: success for drm/xe: Thread prefetch of SVM ranges (rev2) Patchwork
2025-06-16 12:32 ` ✓ Xe.CI.BAT: " Patchwork
2025-06-16 17:53 ` ✗ Xe.CI.Full: failure " Patchwork
2025-06-17 12:43 ` [PATCH] drm/xe: Thread prefetch of SVM ranges Thomas Hellström
2025-06-17 14:43   ` Matthew Brost
  -- strict thread matches above, loose matches on Subject: below --
2025-05-28 17:27 Matthew Brost

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox