From: Matthew Brost <matthew.brost@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: stuart.summers@intel.com, arvind.yadav@intel.com,
himal.prasad.ghimiray@intel.com,
thomas.hellstrom@linux.intel.com, francois.dugast@intel.com
Subject: [PATCH v3 03/12] drm/xe: Thread prefetch of SVM ranges
Date: Wed, 25 Feb 2026 12:27:27 -0800
Message-ID: <20260225202736.2723250-4-matthew.brost@intel.com>
In-Reply-To: <20260225202736.2723250-1-matthew.brost@intel.com>
The migrate_vma_* functions are very CPU-intensive; as a result,
prefetching SVM ranges is limited by CPU performance rather than by
paging copy engine bandwidth. To accelerate SVM range prefetching,
thread the step that calls migrate_vma_*, reusing the page fault work
queue to run the per-range work items.
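In prefetch_ranges() this takes a fan-out/join shape: one work item per
range is queued on the shared page fault workqueue, then each item is
flushed and the first error is collected. Below is a minimal standalone
sketch of that pattern; struct range_work, range_work_func() and
fan_out_ranges() are illustrative names only, not xe driver code, and
the void *range payload stands in for the real per-range SVM state:

#include <linux/slab.h>
#include <linux/workqueue.h>

struct range_work {
	struct work_struct work;
	void *range;	/* stand-in for per-range SVM state */
	int err;
};

static void range_work_func(struct work_struct *w)
{
	struct range_work *rw = container_of(w, struct range_work, work);

	/* The CPU-heavy migrate_vma_* step for one range runs here */
	rw->err = 0;
}

static int fan_out_ranges(struct workqueue_struct *wq, void **ranges, int n)
{
	struct range_work *rws;
	int i, err = 0;

	rws = kvmalloc_array(n, sizeof(*rws), GFP_KERNEL);
	if (!rws)
		return -ENOMEM;

	/* Fan out: one work item per range on the shared workqueue */
	for (i = 0; i < n; i++) {
		rws[i].range = ranges[i];
		rws[i].err = 0;
		INIT_WORK(&rws[i].work, range_work_func);
		queue_work(wq, &rws[i].work);
	}

	/* Join: wait for each item, keeping the first error seen */
	for (i = 0; i < n; i++) {
		flush_work(&rws[i].work);
		if (rws[i].err && !err)
			err = rws[i].err;
	}

	kvfree(rws);
	return err;
}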
Running xe_exec_system_allocator --r prefetch-benchmark, which tests
64MB prefetches, shows throughput increasing from ~4.35 GB/s to
~12.25 GB/s with this patch on drm-tip. Enabling high SLPC further
increases throughput to ~15.25 GB/s, and combining SLPC with ULLS
raises it to ~16 GB/s; both of these optimizations are upcoming.
v2:
- Use dedicated prefetch workqueue
- Pick dedicated prefetch thread count based on profiling
- Skip threaded prefetch when there is only one range or when prefetching to SRAM
- Fully tested
v3:
- Use page fault work queue
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_pagefault.c | 31 +++++-
drivers/gpu/drm/xe/xe_svm.c | 23 ++++-
drivers/gpu/drm/xe/xe_svm.h | 3 +-
drivers/gpu/drm/xe/xe_vm.c | 150 +++++++++++++++++++++++-------
drivers/gpu/drm/xe/xe_vm_types.h | 15 +--
5 files changed, 173 insertions(+), 49 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_pagefault.c b/drivers/gpu/drm/xe/xe_pagefault.c
index 421262c2a63a..a372db7cd839 100644
--- a/drivers/gpu/drm/xe/xe_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_pagefault.c
@@ -173,7 +173,17 @@ static int xe_pagefault_service(struct xe_pagefault *pf)
if (IS_ERR(vm))
return PTR_ERR(vm);
- down_read(&vm->lock);
+ /*
+ * We can't block threaded prefetches from completing: the page fault
+ * workqueue is shared with prefetches, and prefetches flush work items
+ * onto this same workqueue. down_read() can block behind a pending
+ * down_write(), so a plain down_read() here could deadlock; use a
+ * trylock and retry the fault instead.
+ */
+ if (!down_read_trylock(&vm->lock)) {
+ err = -EAGAIN;
+ goto put_vm;
+ }
if (xe_vm_is_closed(vm)) {
err = -ENOENT;
@@ -198,11 +208,23 @@ static int xe_pagefault_service(struct xe_pagefault *pf)
if (!err)
vm->usm.last_fault_vma = vma;
up_read(&vm->lock);
+put_vm:
xe_vm_put(vm);
return err;
}
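+/* Rewind the queue tail one entry so the just-popped fault is serviced again */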
+static void xe_pagefault_queue_retry(struct xe_pagefault_queue *pf_queue,
+ struct xe_pagefault *pf)
+{
+ spin_lock_irq(&pf_queue->lock);
+ if (!pf_queue->tail)
+ pf_queue->tail = pf_queue->size - xe_pagefault_entry_size();
+ else
+ pf_queue->tail -= xe_pagefault_entry_size();
+ spin_unlock_irq(&pf_queue->lock);
+}
+
static bool xe_pagefault_queue_pop(struct xe_pagefault_queue *pf_queue,
struct xe_pagefault *pf)
{
@@ -260,7 +282,12 @@ static void xe_pagefault_queue_work(struct work_struct *w)
continue;
err = xe_pagefault_service(&pf);
- if (err) {
+
+ if (err == -EAGAIN) {
+ xe_pagefault_queue_retry(pf_queue, &pf);
+ queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
+ break;
+ } else if (err) {
if (!(pf.consumer.access_type & XE_PAGEFAULT_ACCESS_PREFETCH)) {
xe_pagefault_print(&pf);
xe_gt_info(pf.gt, "Fault response: Unsuccessful %pe\n",
diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
index 3e59695e0c01..66eee490a0c3 100644
--- a/drivers/gpu/drm/xe/xe_svm.c
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -436,8 +436,19 @@ static void xe_svm_garbage_collector_work_func(struct work_struct *w)
struct xe_vm *vm = container_of(w, struct xe_vm,
svm.garbage_collector.work);
- guard(rwsem_read)(&vm->lock);
- xe_svm_garbage_collector(vm);
+ /*
+ * We can't block threaded prefetches from completing: the page fault
+ * workqueue is shared with prefetches, and prefetches flush work items
+ * onto this same workqueue. down_read() can block behind a pending
+ * down_write(), so a plain down_read() here could deadlock; use a
+ * trylock and requeue the work on failure instead.
+ */
+ if (down_read_trylock(&vm->lock)) {
+ xe_svm_garbage_collector(vm);
+ up_read(&vm->lock);
+ } else {
+ queue_work(vm->xe->usm.pf_wq, &vm->svm.garbage_collector.work);
+ }
}
#if IS_ENABLED(CONFIG_DRM_XE_PAGEMAP)
@@ -988,6 +999,7 @@ void xe_svm_range_migrate_to_smem(struct xe_vm *vm, struct xe_svm_range *range)
* @tile_mask: Mask representing the tiles to be checked
* @dpagemap: if !%NULL, the range is expected to be present
* in device memory identified by this parameter.
+ * @valid_pages: Output argument; set to true if the range's pages are valid
*
* The xe_svm_range_validate() function checks if a range is
* valid and located in the desired memory region.
@@ -996,7 +1008,8 @@ void xe_svm_range_migrate_to_smem(struct xe_vm *vm, struct xe_svm_range *range)
*/
bool xe_svm_range_validate(struct xe_vm *vm,
struct xe_svm_range *range,
- u8 tile_mask, const struct drm_pagemap *dpagemap)
+ u8 tile_mask, const struct drm_pagemap *dpagemap,
+ bool *valid_pages)
{
bool ret;
@@ -1008,6 +1021,8 @@ bool xe_svm_range_validate(struct xe_vm *vm,
else
ret = ret && !range->base.pages.dpagemap;
+ *valid_pages = xe_svm_range_pages_valid(range);
+
xe_svm_notifier_unlock(vm);
return ret;
@@ -2064,5 +2079,5 @@ struct drm_pagemap *xe_drm_pagemap_from_fd(int fd, u32 region_instance)
void xe_svm_flush(struct xe_vm *vm)
{
if (xe_vm_in_fault_mode(vm))
- flush_work(&vm->svm.garbage_collector.work);
+ __flush_workqueue(vm->xe->usm.pf_wq);
}
diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
index 7dcf8b084692..4fe6b846aca8 100644
--- a/drivers/gpu/drm/xe/xe_svm.h
+++ b/drivers/gpu/drm/xe/xe_svm.h
@@ -132,7 +132,8 @@ void xe_svm_range_migrate_to_smem(struct xe_vm *vm, struct xe_svm_range *range);
bool xe_svm_range_validate(struct xe_vm *vm,
struct xe_svm_range *range,
- u8 tile_mask, const struct drm_pagemap *dpagemap);
+ u8 tile_mask, const struct drm_pagemap *dpagemap,
+ bool *valid_pages);
u64 xe_svm_find_vma_start(struct xe_vm *vm, u64 addr, u64 end, struct xe_vma *vma);
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 204a89ca3397..06669e9c500d 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -2399,6 +2399,7 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
struct drm_pagemap *dpagemap = NULL;
u8 id, tile_mask = 0;
u32 i;
+ bool valid_pages;
if (xe_vma_is_userptr(vma))
vops->flags |= XE_VMA_OPS_FLAG_MODIFIES_GPUVA;
@@ -2446,8 +2447,10 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
goto unwind_prefetch_ops;
}
- if (xe_svm_range_validate(vm, svm_range, tile_mask, dpagemap)) {
+ if (xe_svm_range_validate(vm, svm_range, tile_mask,
+ dpagemap, &valid_pages)) {
xe_svm_range_debug(svm_range, "PREFETCH - RANGE IS VALID");
+ xe_assert(vm->xe, valid_pages);
goto check_next_range;
}
@@ -2460,6 +2463,8 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
op->prefetch_range.ranges_count++;
vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH;
+ if (valid_pages)
+ vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_VALID_RANGE;
xe_svm_range_debug(svm_range, "PREFETCH - RANGE CREATED");
check_next_range:
if (range_end > xe_svm_range_end(svm_range) &&
@@ -2976,16 +2981,83 @@ static int check_ufence(struct xe_vma *vma)
return 0;
}
-static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
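+/* Per-range prefetch state; run inline or fanned out on the page fault wq */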
+struct prefetch_thread {
+ struct work_struct work;
+ struct drm_gpusvm_ctx *ctx;
+ struct xe_vma *vma;
+ struct xe_svm_range *svm_range;
+ struct drm_pagemap *dpagemap;
+ int err;
+};
+
+static void prefetch_thread_func(struct prefetch_thread *thread)
+{
+ struct xe_vma *vma = thread->vma;
+ struct xe_vm *vm = xe_vma_vm(vma);
+ struct xe_svm_range *svm_range = thread->svm_range;
+ struct drm_pagemap *dpagemap = thread->dpagemap;
+ int err = 0;
+
+ guard(mutex)(&svm_range->lock);
+
+ if (xe_svm_range_is_removed(svm_range)) {
+ thread->err = -ENODATA;
+ return;
+ }
+
+ if (!dpagemap)
+ xe_svm_range_migrate_to_smem(vm, svm_range);
+
+ if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)) {
+ drm_dbg(&vm->xe->drm,
+ "Prefetch pagemap is %s start 0x%016lx end 0x%016lx\n",
+ dpagemap ? dpagemap->drm->unique : "system",
+ xe_svm_range_start(svm_range), xe_svm_range_end(svm_range));
+ }
+
+ if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, dpagemap)) {
+ err = xe_svm_alloc_vram(svm_range, thread->ctx, dpagemap);
+ if (err) {
+ drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
+ vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
+ thread->err = -ENODATA;
+ return;
+ }
+ xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
+ }
+
+ err = xe_svm_range_get_pages(vm, svm_range, thread->ctx);
+ if (err) {
+ drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
+ vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
+ if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
+ err = -ENODATA;
+ thread->err = err;
+ return;
+ }
+ xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
+}
+
+static void prefetch_work_func(struct work_struct *w)
+{
+ struct prefetch_thread *thread =
+ container_of(w, struct prefetch_thread, work);
+
+ prefetch_thread_func(thread);
+}
+
+static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops,
+ struct xe_vma_op *op)
{
bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
struct drm_pagemap *dpagemap = op->prefetch_range.dpagemap;
- int err = 0;
-
struct xe_svm_range *svm_range;
struct drm_gpusvm_ctx ctx = {};
+ struct prefetch_thread stack_thread, *thread, *prefetches;
unsigned long i;
+ int err = 0, idx = 0;
+ bool skip_threads;
if (!xe_vma_is_cpu_addr_mirror(vma))
return 0;
@@ -2995,42 +3067,49 @@ static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
ctx.device_private_page_owner = xe_svm_private_page_owner(vm, !dpagemap);
- /* TODO: Threading the migration */
- xa_for_each(&op->prefetch_range.range, i, svm_range) {
- guard(mutex)(&svm_range->lock);
-
- if (xe_svm_range_is_removed(svm_range))
- return -ENODATA;
+ skip_threads = op->prefetch_range.ranges_count == 1 ||
+ (!dpagemap && !(vops->flags &
+ XE_VMA_OPS_FLAG_HAS_SVM_VALID_RANGE)) ||
+ !(vops->flags & XE_VMA_OPS_FLAG_DOWNGRADE_LOCK);
+ thread = skip_threads ? &stack_thread : NULL;
- if (!dpagemap)
- xe_svm_range_migrate_to_smem(vm, svm_range);
+ if (!skip_threads) {
+ prefetches = kvmalloc_array(op->prefetch_range.ranges_count,
+ sizeof(*prefetches), GFP_KERNEL);
+ if (!prefetches)
+ return -ENOMEM;
+ }
- if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)) {
- drm_dbg(&vm->xe->drm,
- "Prefetch pagemap is %s start 0x%016lx end 0x%016lx\n",
- dpagemap ? dpagemap->drm->unique : "system",
- xe_svm_range_start(svm_range), xe_svm_range_end(svm_range));
+ xa_for_each(&op->prefetch_range.range, i, svm_range) {
+ if (!skip_threads) {
+ thread = prefetches + idx++;
+ INIT_WORK(&thread->work, prefetch_work_func);
}
- if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, dpagemap)) {
- err = xe_svm_alloc_vram(svm_range, &ctx, dpagemap);
- if (err) {
- drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
- vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
- return -ENODATA;
- }
- xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
+ thread->ctx = &ctx;
+ thread->vma = vma;
+ thread->svm_range = svm_range;
+ thread->dpagemap = dpagemap;
+ thread->err = 0;
+
+ if (skip_threads) {
+ prefetch_thread_func(thread);
+ if (thread->err)
+ return thread->err;
+ } else {
+ queue_work(vm->xe->usm.pf_wq, &thread->work);
}
+ }
- err = xe_svm_range_get_pages(vm, svm_range, &ctx);
- if (err) {
- drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
- vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
- if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
- err = -ENODATA;
- return err;
+ if (!skip_threads) {
+ for (i = 0; i < idx; ++i) {
+ thread = prefetches + i;
+
+ flush_work(&thread->work);
+ if (thread->err && (!err || err == -ENODATA))
+ err = thread->err;
}
- xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
+ kvfree(prefetches);
}
return err;
@@ -3109,7 +3188,8 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
return err;
}
-static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops)
+static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm,
+ struct xe_vma_ops *vops)
{
struct xe_vma_op *op;
int err;
@@ -3119,7 +3199,7 @@ static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops
list_for_each_entry(op, &vops->list, link) {
if (op->base.op == DRM_GPUVA_OP_PREFETCH) {
- err = prefetch_ranges(vm, op);
+ err = prefetch_ranges(vm, vops, op);
if (err)
return err;
}
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index db6e8e22a69f..7d5a82b2b64f 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -513,13 +513,14 @@ struct xe_vma_ops {
/** @pt_update_ops: page table update operations */
struct xe_vm_pgtable_update_ops pt_update_ops[XE_MAX_TILES_PER_DEVICE];
/** @flag: signify the properties within xe_vma_ops*/
-#define XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH BIT(0)
-#define XE_VMA_OPS_FLAG_MADVISE BIT(1)
-#define XE_VMA_OPS_ARRAY_OF_BINDS BIT(2)
-#define XE_VMA_OPS_FLAG_SKIP_TLB_WAIT BIT(3)
-#define XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP BIT(4)
-#define XE_VMA_OPS_FLAG_MODIFIES_GPUVA BIT(5)
-#define XE_VMA_OPS_FLAG_DOWNGRADE_LOCK BIT(6)
+#define XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH BIT(0)
+#define XE_VMA_OPS_FLAG_MADVISE BIT(1)
+#define XE_VMA_OPS_ARRAY_OF_BINDS BIT(2)
+#define XE_VMA_OPS_FLAG_SKIP_TLB_WAIT BIT(3)
+#define XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP BIT(4)
+#define XE_VMA_OPS_FLAG_MODIFIES_GPUVA BIT(5)
+#define XE_VMA_OPS_FLAG_DOWNGRADE_LOCK BIT(6)
+#define XE_VMA_OPS_FLAG_HAS_SVM_VALID_RANGE BIT(7)
u32 flags;
#ifdef TEST_VM_OPS_ERROR
/** @inject_error: inject error to test error handling */
--
2.34.1