Intel-XE Archive on lore.kernel.org
* [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes
  2025-02-24  3:24 [PATCH v2 0/3] " Matthew Brost
@ 2025-02-24  3:24 ` Matthew Brost
  0 siblings, 0 replies; 14+ messages in thread
From: Matthew Brost @ 2025-02-24  3:24 UTC
  To: intel-xe; +Cc: thomas.hellstrom, matthew.auld

Squash a bind operation that follows a userptr invalidation into a
clearing of PTEs. This prevents valid GPU page tables from pointing to
stale CPU pages.

Fix up initial bind handling to always add VMAs to the invalidation
list and clear PTEs.

Remove the unused rebind variable in xe_pt.

Always hold the notifier lock across TLB invalidation in the notifier
to prevent a UAF if an unbind races.

All of the above changes are included in a single Fixes patch in hopes
of an easier backport.
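
In outline, the non-fault-mode flow is (a condensed sketch mirroring
the patched vma_check_userptr() in the diff below; illustration only,
not a literal excerpt):

	/* Pages still valid under the notifier lock: commit the bind. */
	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
				     uvma->userptr.notifier_seq))
		return 0;

	if (xe_vm_in_fault_mode(vm))
		return -EAGAIN;		/* fault mode just retries the bind */

	/* Mark the VMA invalidated so a later rebind repins the pages. */
	spin_lock(&vm->userptr.invalidated_lock);
	list_move_tail(&uvma->userptr.invalidate_link, &vm->userptr.invalidated);
	spin_unlock(&vm->userptr.invalidated_lock);

	/* v2: wait for bookkeep fences before touching live page tables. */
	err = dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
				    false, MAX_SCHEDULE_TIMEOUT);
	XE_WARN_ON(err <= 0);

	/* Squash the staged bind: staged leaf PTEs become empty/scratch
	 * and live PTEs for the range are zapped, so no valid GPU page
	 * table can point at stale CPU pages. */
	vma_convert_to_invalidation(tile, &uvma->vma, pt_update);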

v2:
 - Wait dma-resv bookkeep before issuing PTE zap (Thomas)
 - Support scratch page on invalidation (Thomas)

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: <stable@vger.kernel.org>
Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single job")
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_pt.c       | 146 +++++++++++++++++++++++--------
 drivers/gpu/drm/xe/xe_pt_types.h |   3 +-
 drivers/gpu/drm/xe/xe_vm.c       |   4 +-
 3 files changed, 115 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index 1ddcc7e79a93..add521b5c6ae 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -351,7 +351,8 @@ xe_pt_new_shared(struct xe_walk_update *wupd, struct xe_pt *parent,
  */
 static int
 xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
-		   pgoff_t offset, struct xe_pt *xe_child, u64 pte)
+		   pgoff_t offset, struct xe_pt *xe_child, u64 pte,
+		   unsigned int level)
 {
 	struct xe_pt_update *upd = &xe_walk->wupd.updates[parent->level];
 	struct xe_pt_update *child_upd = xe_child ?
@@ -389,6 +390,9 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
 		idx = offset - entry->ofs;
 		entry->pt_entries[idx].pt = xe_child;
 		entry->pt_entries[idx].pte = pte;
+		entry->pt_entries[idx].level = level;
+		if (likely(!xe_child))
+			entry->pt_entries[idx].level |= XE_PT_IS_LEAF;
 		entry->qwords++;
 	}
 
@@ -515,7 +519,8 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
 			}
 		}
 
-		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte);
+		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte,
+					 level);
 		if (unlikely(ret))
 			return ret;
 
@@ -571,7 +576,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
 
 		pte = vm->pt_ops->pde_encode_bo(xe_child->bo, 0, pat_index) | flags;
 		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, xe_child,
-					 pte);
+					 pte, level);
 	}
 
 	*action = ACTION_SUBTREE;
@@ -752,6 +757,10 @@ struct xe_pt_zap_ptes_walk {
 	/* Input parameters for the walk */
 	/** @tile: The tile we're building for */
 	struct xe_tile *tile;
+	/** @vm: VM we're building for */
+	struct xe_vm *vm;
+	/** @scratch: write entries with scratch */
+	bool scratch;
 
 	/* Output */
 	/** @needs_invalidate: Whether we need to invalidate TLB*/
@@ -779,9 +788,18 @@ static int xe_pt_zap_ptes_entry(struct xe_ptw *parent, pgoff_t offset,
 	 */
 	if (xe_pt_nonshared_offsets(addr, next, --level, walk, action, &offset,
 				    &end_offset)) {
-		xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
-			      offset * sizeof(u64), 0,
-			      (end_offset - offset) * sizeof(u64));
+		if (unlikely(xe_walk->scratch)) {
+			u64 pte = __xe_pt_empty_pte(xe_walk->tile, xe_walk->vm,
+						    level);
+
+			for (; offset < end_offset; ++offset)
+				xe_pt_write(tile_to_xe(xe_walk->tile),
+					    &xe_child->bo->vmap, offset, pte);
+		} else {
+			xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
+				      offset * sizeof(u64), 0,
+				      (end_offset - offset) * sizeof(u64));
+		}
 		xe_walk->needs_invalidate = true;
 	}
 
@@ -792,6 +810,31 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
 	.pt_entry = xe_pt_zap_ptes_entry,
 };
 
+struct xe_pt_zap_ptes_flags {
+	bool scratch:1;
+};
+
+static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma,
+			     struct xe_pt_zap_ptes_flags flags)
+{
+	struct xe_pt_zap_ptes_walk xe_walk = {
+		.base = {
+			.ops = &xe_pt_zap_ptes_ops,
+			.shifts = xe_normal_pt_shifts,
+			.max_level = XE_PT_HIGHEST_LEVEL,
+		},
+		.tile = tile,
+		.vm = xe_vma_vm(vma),
+		.scratch = flags.scratch,
+	};
+	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
+
+	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
+				xe_vma_end(vma), &xe_walk.base);
+
+	return xe_walk.needs_invalidate;
+}
+
 /**
  * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
  * @tile: The tile we're zapping for.
@@ -810,24 +853,13 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
  */
 bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
 {
-	struct xe_pt_zap_ptes_walk xe_walk = {
-		.base = {
-			.ops = &xe_pt_zap_ptes_ops,
-			.shifts = xe_normal_pt_shifts,
-			.max_level = XE_PT_HIGHEST_LEVEL,
-		},
-		.tile = tile,
-	};
-	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
+	struct xe_pt_zap_ptes_flags flags = {};
 	u8 pt_mask = (vma->tile_present & ~vma->tile_invalidated);
 
 	if (!(pt_mask & BIT(tile->id)))
 		return false;
 
-	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
-				xe_vma_end(vma), &xe_walk.base);
-
-	return xe_walk.needs_invalidate;
+	return __xe_pt_zap_ptes(tile, vma, flags);
 }
 
 static void
@@ -1201,7 +1233,46 @@ static bool xe_pt_userptr_inject_eagain(struct xe_userptr_vma *uvma)
 
 #endif
 
-static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
+static void
+vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
+			    struct xe_vm_pgtable_update_ops *pt_update)
+{
+	struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
+	int i, j, k;
+
+	/*
+	 * Need to update this function to bypass scratch setup if in fault mode
+	 */
+	xe_assert(xe_vma_vm(vma)->xe, !xe_vm_in_fault_mode(xe_vma_vm(vma)));
+
+	for (i = 0; i < pt_update->current_op; ++i) {
+		struct xe_vm_pgtable_update_op *op = &pt_update->ops[i];
+
+		if (vma != op->vma || (!op->bind && !op->rebind))
+			continue;
+
+		for (j = 0; j < op->num_entries; ++j) {
+			for (k = 0; k < op->entries[j].qwords; ++k) {
+				struct xe_pt_entry *entry =
+					&op->entries[j].pt_entries[k];
+				unsigned int level = entry->level;
+
+				if (!(level & XE_PT_IS_LEAF))
+					continue;
+
+				level &= ~XE_PT_IS_LEAF;
+				entry->pte = __xe_pt_empty_pte(tile,
+							       xe_vma_vm(vma),
+							       level);
+			}
+		}
+	}
+
+	__xe_pt_zap_ptes(tile, vma, flags);
+}
+
+static int vma_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
+			     struct xe_vma *vma,
 			     struct xe_vm_pgtable_update_ops *pt_update)
 {
 	struct xe_userptr_vma *uvma;
@@ -1215,9 +1286,6 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
 	uvma = to_userptr_vma(vma);
 	notifier_seq = uvma->userptr.notifier_seq;
 
-	if (uvma->userptr.initial_bind && !xe_vm_in_fault_mode(vm))
-		return 0;
-
 	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
 				     notifier_seq) &&
 	    !xe_pt_userptr_inject_eagain(uvma))
@@ -1226,6 +1294,8 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
 	if (xe_vm_in_fault_mode(vm)) {
 		return -EAGAIN;
 	} else {
+		long err;
+
 		spin_lock(&vm->userptr.invalidated_lock);
 		list_move_tail(&uvma->userptr.invalidate_link,
 			       &vm->userptr.invalidated);
@@ -1234,25 +1304,27 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
 		if (xe_vm_in_preempt_fence_mode(vm)) {
 			struct dma_resv_iter cursor;
 			struct dma_fence *fence;
-			long err;
 
 			dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
 					    DMA_RESV_USAGE_BOOKKEEP);
 			dma_resv_for_each_fence_unlocked(&cursor, fence)
 				dma_fence_enable_sw_signaling(fence);
 			dma_resv_iter_end(&cursor);
-
-			err = dma_resv_wait_timeout(xe_vm_resv(vm),
-						    DMA_RESV_USAGE_BOOKKEEP,
-						    false, MAX_SCHEDULE_TIMEOUT);
-			XE_WARN_ON(err <= 0);
 		}
+
+		err = dma_resv_wait_timeout(xe_vm_resv(vm),
+					    DMA_RESV_USAGE_BOOKKEEP,
+					    false, MAX_SCHEDULE_TIMEOUT);
+		XE_WARN_ON(err <= 0);
+
+		vma_convert_to_invalidation(tile, vma, pt_update);
 	}
 
 	return 0;
 }
 
-static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
+static int op_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
+			    struct xe_vma_op *op,
 			    struct xe_vm_pgtable_update_ops *pt_update)
 {
 	int err = 0;
@@ -1264,18 +1336,21 @@ static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
 		if (!op->map.immediate && xe_vm_in_fault_mode(vm))
 			break;
 
-		err = vma_check_userptr(vm, op->map.vma, pt_update);
+		err = vma_check_userptr(tile, vm, op->map.vma, pt_update);
 		break;
 	case DRM_GPUVA_OP_REMAP:
 		if (op->remap.prev)
-			err = vma_check_userptr(vm, op->remap.prev, pt_update);
+			err = vma_check_userptr(tile, vm, op->remap.prev,
+						pt_update);
 		if (!err && op->remap.next)
-			err = vma_check_userptr(vm, op->remap.next, pt_update);
+			err = vma_check_userptr(tile, vm, op->remap.next,
+						pt_update);
 		break;
 	case DRM_GPUVA_OP_UNMAP:
 		break;
 	case DRM_GPUVA_OP_PREFETCH:
-		err = vma_check_userptr(vm, gpuva_to_vma(op->base.prefetch.va),
+		err = vma_check_userptr(tile, vm,
+					gpuva_to_vma(op->base.prefetch.va),
 					pt_update);
 		break;
 	default:
@@ -1301,7 +1376,8 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
 	down_read(&vm->userptr.notifier_lock);
 
 	list_for_each_entry(op, &vops->list, link) {
-		err = op_check_userptr(vm, op, pt_update_ops);
+		err = op_check_userptr(&vm->xe->tiles[pt_update->tile_id],
+				       vm, op, pt_update_ops);
 		if (err) {
 			up_read(&vm->userptr.notifier_lock);
 			break;
diff --git a/drivers/gpu/drm/xe/xe_pt_types.h b/drivers/gpu/drm/xe/xe_pt_types.h
index 384cc04de719..6f99ff2b8fce 100644
--- a/drivers/gpu/drm/xe/xe_pt_types.h
+++ b/drivers/gpu/drm/xe/xe_pt_types.h
@@ -29,7 +29,6 @@ struct xe_pt {
 	struct xe_bo *bo;
 	unsigned int level;
 	unsigned int num_live;
-	bool rebind;
 	bool is_compact;
 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
 	/** addr: Virtual address start address of the PT. */
@@ -52,6 +51,8 @@ struct xe_pt_ops {
 struct xe_pt_entry {
 	struct xe_pt *pt;
 	u64 pte;
+#define XE_PT_IS_LEAF	BIT(31)
+	unsigned int level;
 };
 
 struct xe_vm_pgtable_update {
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index ea2e287e6526..f90e5c92010c 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -623,8 +623,6 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
 		spin_unlock(&vm->userptr.invalidated_lock);
 	}
 
-	up_write(&vm->userptr.notifier_lock);
-
 	/*
 	 * Preempt fences turn into schedule disables, pipeline these.
 	 * Note that even in fault mode, we need to wait for binds and
@@ -647,6 +645,8 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
 		XE_WARN_ON(err);
 	}
 
+	up_write(&vm->userptr.notifier_lock);
+
 	trace_xe_vma_userptr_invalidate_complete(vma);
 
 	return true;
-- 
2.34.1



* [PATCH v2 0/3] Userptr fixes
@ 2025-02-24  4:05 Matthew Brost
  2025-02-24  4:05 ` [PATCH v2 1/3] drm/xe/userptr: fix EFAULT handling Matthew Brost
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Matthew Brost @ 2025-02-24  4:05 UTC
  To: intel-xe; +Cc: thomas.hellstrom, matthew.auld

Tested with the below tests on BMG with CONFIG_DRM_XE_USERPTR_INVAL_INJECT:
xe_exec_basic; xe_exec_compute_mode; xe_exec_fault_mode; xe_vm; xe_exec_threads

Including a patch from Matt Auld as it is required for the above tests to pass.

Matt

Matthew Auld (1):
  drm/xe/userptr: fix EFAULT handling

Matthew Brost (2):
  drm/xe: Userptr invalidation race with binds fixes
  drm/xe: Add staging tree for VM binds

 drivers/gpu/drm/xe/xe_pt.c       | 194 +++++++++++++++++++++++--------
 drivers/gpu/drm/xe/xe_pt_types.h |   3 +-
 drivers/gpu/drm/xe/xe_pt_walk.c  |   3 +-
 drivers/gpu/drm/xe/xe_pt_walk.h  |   4 +
 drivers/gpu/drm/xe/xe_vm.c       |  16 ++-
 5 files changed, 170 insertions(+), 50 deletions(-)

-- 
2.34.1



* [PATCH v2 1/3] drm/xe/userptr: fix EFAULT handling
  2025-02-24  4:05 [PATCH v2 0/3] Userptr fixes Matthew Brost
@ 2025-02-24  4:05 ` Matthew Brost
  2025-02-24  4:05 ` [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes Matthew Brost
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Matthew Brost @ 2025-02-24  4:05 UTC
  To: intel-xe; +Cc: thomas.hellstrom, matthew.auld

From: Matthew Auld <matthew.auld@intel.com>

Currently we treat EFAULT from hmm_range_fault() as a non-fatal error
when called from xe_vm_userptr_pin(), with the idea that we want to
avoid killing the entire vm and returning an error, under the
assumption that the user just did an unmap or something and has no
intention of actually touching that memory from the GPU. At this point
we have already zapped the PTEs, so any access should generate a page
fault, and if the pin also fails there it will then become fatal.

However it looks like it's possible for the userptr vma to still be on
the rebind list in preempt_rebind_work_func() if we had to retry the
pin due to something happening in the caller before we did the rebind
step, while in the meantime the userptr needed to be re-validated, this
time hitting the EFAULT.

This might explain an internal user report of hitting:

[  191.738349] WARNING: CPU: 1 PID: 157 at drivers/gpu/drm/xe/xe_res_cursor.h:158 xe_pt_stage_bind.constprop.0+0x60a/0x6b0 [xe]
[  191.738551] Workqueue: xe-ordered-wq preempt_rebind_work_func [xe]
[  191.738616] RIP: 0010:xe_pt_stage_bind.constprop.0+0x60a/0x6b0 [xe]
[  191.738690] Call Trace:
[  191.738692]  <TASK>
[  191.738694]  ? show_regs+0x69/0x80
[  191.738698]  ? __warn+0x93/0x1a0
[  191.738703]  ? xe_pt_stage_bind.constprop.0+0x60a/0x6b0 [xe]
[  191.738759]  ? report_bug+0x18f/0x1a0
[  191.738764]  ? handle_bug+0x63/0xa0
[  191.738767]  ? exc_invalid_op+0x19/0x70
[  191.738770]  ? asm_exc_invalid_op+0x1b/0x20
[  191.738777]  ? xe_pt_stage_bind.constprop.0+0x60a/0x6b0 [xe]
[  191.738834]  ? ret_from_fork_asm+0x1a/0x30
[  191.738849]  bind_op_prepare+0x105/0x7b0 [xe]
[  191.738906]  ? dma_resv_reserve_fences+0x301/0x380
[  191.738912]  xe_pt_update_ops_prepare+0x28c/0x4b0 [xe]
[  191.738966]  ? kmemleak_alloc+0x4b/0x80
[  191.738973]  ops_execute+0x188/0x9d0 [xe]
[  191.739036]  xe_vm_rebind+0x4ce/0x5a0 [xe]
[  191.739098]  ? trace_hardirqs_on+0x4d/0x60
[  191.739112]  preempt_rebind_work_func+0x76f/0xd00 [xe]

This is followed by an NPD when running some workload, since the sg was
never actually populated but the vma is still marked for rebind when it
should be skipped for this special EFAULT case. From the logs it does
seem like we hit this special EFAULT case before the explosions.
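
In sequence form, the suspected failure is (reconstructed from the
description above; timing is illustrative):

  1. xe_vm_userptr_pin() pins the userptr; the vma lands on the rebind list.
  2. The caller hits some other condition and retries before xe_vm_rebind()
     runs.
  3. The notifier invalidates the userptr, so it must be re-validated.
  4. The retried xe_vma_userptr_pin_pages() now returns -EFAULT; the sg is
     never populated, but the vma is still on the rebind list.
  5. preempt_rebind_work_func() rebinds the vma without backing pages,
     hitting the WARN in xe_pt_stage_bind() and later the NPD.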

v2 (MattB):
 - Move earlier

Fixes: 521db22a1d70 ("drm/xe: Invalidate userptr VMA on page pin fault")
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: <stable@vger.kernel.org> # v6.10+
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/xe/xe_vm.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index d664f2e418b2..ea2e287e6526 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -681,6 +681,18 @@ int xe_vm_userptr_pin(struct xe_vm *vm)
 		err = xe_vma_userptr_pin_pages(uvma);
 		if (err == -EFAULT) {
 			list_del_init(&uvma->userptr.repin_link);
+			/*
+			 * We might have already done the pin once already, but
+			 * then had to retry before the re-bind happened, due
+			 * some other condition in the caller, but in the
+			 * meantime the userptr got dinged by the notifier such
+			 * that we need to revalidate here, but this time we hit
+			 * the EFAULT. In such a case make sure we remove
+			 * ourselves from the rebind list to avoid going down in
+			 * flames.
+			 */
+			if (!list_empty(&uvma->vma.combined_links.rebind))
+				list_del_init(&uvma->vma.combined_links.rebind);
 
 			/* Wait for pending binds */
 			xe_vm_lock(vm, false);
-- 
2.34.1



* [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes
  2025-02-24  4:05 [PATCH v2 0/3] Userptr fixes Matthew Brost
  2025-02-24  4:05 ` [PATCH v2 1/3] drm/xe/userptr: fix EFAULT handling Matthew Brost
@ 2025-02-24  4:05 ` Matthew Brost
  2025-02-24  8:21   ` Thomas Hellström
  2025-02-24  4:05 ` [PATCH v2 3/3] drm/xe: Add staging tree for VM binds Matthew Brost
  2025-02-24 16:15 ` ✗ CI.Patch_applied: failure for Userptr fixes Patchwork
  3 siblings, 1 reply; 14+ messages in thread
From: Matthew Brost @ 2025-02-24  4:05 UTC
  To: intel-xe; +Cc: thomas.hellstrom, matthew.auld

Squash a bind operation that follows a userptr invalidation into a
clearing of PTEs. This prevents valid GPU page tables from pointing to
stale CPU pages.

Fix up initial bind handling to always add VMAs to the invalidation
list and clear PTEs.

Remove the unused rebind variable in xe_pt.

Always hold the notifier lock across TLB invalidation in the notifier
to prevent a UAF if an unbind races.

All of the above changes are included in a single Fixes patch in hopes
of an easier backport.

v2:
 - Wait dma-resv bookkeep before issuing PTE zap (Thomas)
 - Support scratch page on invalidation (Thomas)

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: <stable@vger.kernel.org>
Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single job")
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_pt.c       | 146 +++++++++++++++++++++++--------
 drivers/gpu/drm/xe/xe_pt_types.h |   3 +-
 drivers/gpu/drm/xe/xe_vm.c       |   4 +-
 3 files changed, 115 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index 1ddcc7e79a93..add521b5c6ae 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -351,7 +351,8 @@ xe_pt_new_shared(struct xe_walk_update *wupd, struct xe_pt *parent,
  */
 static int
 xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
-		   pgoff_t offset, struct xe_pt *xe_child, u64 pte)
+		   pgoff_t offset, struct xe_pt *xe_child, u64 pte,
+		   unsigned int level)
 {
 	struct xe_pt_update *upd = &xe_walk->wupd.updates[parent->level];
 	struct xe_pt_update *child_upd = xe_child ?
@@ -389,6 +390,9 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
 		idx = offset - entry->ofs;
 		entry->pt_entries[idx].pt = xe_child;
 		entry->pt_entries[idx].pte = pte;
+		entry->pt_entries[idx].level = level;
+		if (likely(!xe_child))
+			entry->pt_entries[idx].level |= XE_PT_IS_LEAF;
 		entry->qwords++;
 	}
 
@@ -515,7 +519,8 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
 			}
 		}
 
-		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte);
+		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte,
+					 level);
 		if (unlikely(ret))
 			return ret;
 
@@ -571,7 +576,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
 
 		pte = vm->pt_ops->pde_encode_bo(xe_child->bo, 0, pat_index) | flags;
 		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, xe_child,
-					 pte);
+					 pte, level);
 	}
 
 	*action = ACTION_SUBTREE;
@@ -752,6 +757,10 @@ struct xe_pt_zap_ptes_walk {
 	/* Input parameters for the walk */
 	/** @tile: The tile we're building for */
 	struct xe_tile *tile;
+	/** @vm: VM we're building for */
+	struct xe_vm *vm;
+	/** @scratch: write entries with scratch */
+	bool scratch;
 
 	/* Output */
 	/** @needs_invalidate: Whether we need to invalidate TLB*/
@@ -779,9 +788,18 @@ static int xe_pt_zap_ptes_entry(struct xe_ptw *parent, pgoff_t offset,
 	 */
 	if (xe_pt_nonshared_offsets(addr, next, --level, walk, action, &offset,
 				    &end_offset)) {
-		xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
-			      offset * sizeof(u64), 0,
-			      (end_offset - offset) * sizeof(u64));
+		if (unlikely(xe_walk->scratch)) {
+			u64 pte = __xe_pt_empty_pte(xe_walk->tile, xe_walk->vm,
+						    level);
+
+			for (; offset < end_offset; ++offset)
+				xe_pt_write(tile_to_xe(xe_walk->tile),
+					    &xe_child->bo->vmap, offset, pte);
+		} else {
+			xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
+				      offset * sizeof(u64), 0,
+				      (end_offset - offset) * sizeof(u64));
+		}
 		xe_walk->needs_invalidate = true;
 	}
 
@@ -792,6 +810,31 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
 	.pt_entry = xe_pt_zap_ptes_entry,
 };
 
+struct xe_pt_zap_ptes_flags {
+	bool scratch:1;
+};
+
+static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma,
+			     struct xe_pt_zap_ptes_flags flags)
+{
+	struct xe_pt_zap_ptes_walk xe_walk = {
+		.base = {
+			.ops = &xe_pt_zap_ptes_ops,
+			.shifts = xe_normal_pt_shifts,
+			.max_level = XE_PT_HIGHEST_LEVEL,
+		},
+		.tile = tile,
+		.vm = xe_vma_vm(vma),
+		.scratch = flags.scratch,
+	};
+	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
+
+	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
+				xe_vma_end(vma), &xe_walk.base);
+
+	return xe_walk.needs_invalidate;
+}
+
 /**
  * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
  * @tile: The tile we're zapping for.
@@ -810,24 +853,13 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
  */
 bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
 {
-	struct xe_pt_zap_ptes_walk xe_walk = {
-		.base = {
-			.ops = &xe_pt_zap_ptes_ops,
-			.shifts = xe_normal_pt_shifts,
-			.max_level = XE_PT_HIGHEST_LEVEL,
-		},
-		.tile = tile,
-	};
-	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
+	struct xe_pt_zap_ptes_flags flags = {};
 	u8 pt_mask = (vma->tile_present & ~vma->tile_invalidated);
 
 	if (!(pt_mask & BIT(tile->id)))
 		return false;
 
-	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
-				xe_vma_end(vma), &xe_walk.base);
-
-	return xe_walk.needs_invalidate;
+	return __xe_pt_zap_ptes(tile, vma, flags);
 }
 
 static void
@@ -1201,7 +1233,46 @@ static bool xe_pt_userptr_inject_eagain(struct xe_userptr_vma *uvma)
 
 #endif
 
-static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
+static void
+vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
+			    struct xe_vm_pgtable_update_ops *pt_update)
+{
+	struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
+	int i, j, k;
+
+	/*
+	 * Need to update this function to bypass scratch setup if in fault mode
+	 */
+	xe_assert(xe_vma_vm(vma)->xe, !xe_vm_in_fault_mode(xe_vma_vm(vma)));
+
+	for (i = 0; i < pt_update->current_op; ++i) {
+		struct xe_vm_pgtable_update_op *op = &pt_update->ops[i];
+
+		if (vma != op->vma || (!op->bind && !op->rebind))
+			continue;
+
+		for (j = 0; j < op->num_entries; ++j) {
+			for (k = 0; k < op->entries[j].qwords; ++k) {
+				struct xe_pt_entry *entry =
+					&op->entries[j].pt_entries[k];
+				unsigned int level = entry->level;
+
+				if (!(level & XE_PT_IS_LEAF))
+					continue;
+
+				level &= ~XE_PT_IS_LEAF;
+				entry->pte = __xe_pt_empty_pte(tile,
+							       xe_vma_vm(vma),
+							       level);
+			}
+		}
+	}
+
+	__xe_pt_zap_ptes(tile, vma, flags);
+}
+
+static int vma_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
+			     struct xe_vma *vma,
 			     struct xe_vm_pgtable_update_ops *pt_update)
 {
 	struct xe_userptr_vma *uvma;
@@ -1215,9 +1286,6 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
 	uvma = to_userptr_vma(vma);
 	notifier_seq = uvma->userptr.notifier_seq;
 
-	if (uvma->userptr.initial_bind && !xe_vm_in_fault_mode(vm))
-		return 0;
-
 	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
 				     notifier_seq) &&
 	    !xe_pt_userptr_inject_eagain(uvma))
@@ -1226,6 +1294,8 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
 	if (xe_vm_in_fault_mode(vm)) {
 		return -EAGAIN;
 	} else {
+		long err;
+
 		spin_lock(&vm->userptr.invalidated_lock);
 		list_move_tail(&uvma->userptr.invalidate_link,
 			       &vm->userptr.invalidated);
@@ -1234,25 +1304,27 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
 		if (xe_vm_in_preempt_fence_mode(vm)) {
 			struct dma_resv_iter cursor;
 			struct dma_fence *fence;
-			long err;
 
 			dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
 					    DMA_RESV_USAGE_BOOKKEEP);
 			dma_resv_for_each_fence_unlocked(&cursor, fence)
 				dma_fence_enable_sw_signaling(fence);
 			dma_resv_iter_end(&cursor);
-
-			err = dma_resv_wait_timeout(xe_vm_resv(vm),
-						    DMA_RESV_USAGE_BOOKKEEP,
-						    false, MAX_SCHEDULE_TIMEOUT);
-			XE_WARN_ON(err <= 0);
 		}
+
+		err = dma_resv_wait_timeout(xe_vm_resv(vm),
+					    DMA_RESV_USAGE_BOOKKEEP,
+					    false, MAX_SCHEDULE_TIMEOUT);
+		XE_WARN_ON(err <= 0);
+
+		vma_convert_to_invalidation(tile, vma, pt_update);
 	}
 
 	return 0;
 }
 
-static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
+static int op_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
+			    struct xe_vma_op *op,
 			    struct xe_vm_pgtable_update_ops *pt_update)
 {
 	int err = 0;
@@ -1264,18 +1336,21 @@ static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
 		if (!op->map.immediate && xe_vm_in_fault_mode(vm))
 			break;
 
-		err = vma_check_userptr(vm, op->map.vma, pt_update);
+		err = vma_check_userptr(tile, vm, op->map.vma, pt_update);
 		break;
 	case DRM_GPUVA_OP_REMAP:
 		if (op->remap.prev)
-			err = vma_check_userptr(vm, op->remap.prev, pt_update);
+			err = vma_check_userptr(tile, vm, op->remap.prev,
+						pt_update);
 		if (!err && op->remap.next)
-			err = vma_check_userptr(vm, op->remap.next, pt_update);
+			err = vma_check_userptr(tile, vm, op->remap.next,
+						pt_update);
 		break;
 	case DRM_GPUVA_OP_UNMAP:
 		break;
 	case DRM_GPUVA_OP_PREFETCH:
-		err = vma_check_userptr(vm, gpuva_to_vma(op->base.prefetch.va),
+		err = vma_check_userptr(tile, vm,
+					gpuva_to_vma(op->base.prefetch.va),
 					pt_update);
 		break;
 	default:
@@ -1301,7 +1376,8 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
 	down_read(&vm->userptr.notifier_lock);
 
 	list_for_each_entry(op, &vops->list, link) {
-		err = op_check_userptr(vm, op, pt_update_ops);
+		err = op_check_userptr(&vm->xe->tiles[pt_update->tile_id],
+				       vm, op, pt_update_ops);
 		if (err) {
 			up_read(&vm->userptr.notifier_lock);
 			break;
diff --git a/drivers/gpu/drm/xe/xe_pt_types.h b/drivers/gpu/drm/xe/xe_pt_types.h
index 384cc04de719..6f99ff2b8fce 100644
--- a/drivers/gpu/drm/xe/xe_pt_types.h
+++ b/drivers/gpu/drm/xe/xe_pt_types.h
@@ -29,7 +29,6 @@ struct xe_pt {
 	struct xe_bo *bo;
 	unsigned int level;
 	unsigned int num_live;
-	bool rebind;
 	bool is_compact;
 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
 	/** addr: Virtual address start address of the PT. */
@@ -52,6 +51,8 @@ struct xe_pt_ops {
 struct xe_pt_entry {
 	struct xe_pt *pt;
 	u64 pte;
+#define XE_PT_IS_LEAF	BIT(31)
+	unsigned int level;
 };
 
 struct xe_vm_pgtable_update {
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index ea2e287e6526..f90e5c92010c 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -623,8 +623,6 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
 		spin_unlock(&vm->userptr.invalidated_lock);
 	}
 
-	up_write(&vm->userptr.notifier_lock);
-
 	/*
 	 * Preempt fences turn into schedule disables, pipeline these.
 	 * Note that even in fault mode, we need to wait for binds and
@@ -647,6 +645,8 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
 		XE_WARN_ON(err);
 	}
 
+	up_write(&vm->userptr.notifier_lock);
+
 	trace_xe_vma_userptr_invalidate_complete(vma);
 
 	return true;
-- 
2.34.1



* [PATCH v2 3/3] drm/xe: Add staging tree for VM binds
  2025-02-24  4:05 [PATCH v2 0/3] Userptr fixes Matthew Brost
  2025-02-24  4:05 ` [PATCH v2 1/3] drm/xe/userptr: fix EFAULT handling Matthew Brost
  2025-02-24  4:05 ` [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes Matthew Brost
@ 2025-02-24  4:05 ` Matthew Brost
  2025-02-24  9:22   ` Thomas Hellström
  2025-02-24 16:15 ` ✗ CI.Patch_applied: failure for Userptr fixes Patchwork
  3 siblings, 1 reply; 14+ messages in thread
From: Matthew Brost @ 2025-02-24  4:05 UTC
  To: intel-xe; +Cc: thomas.hellstrom, matthew.auld

Concurrent VM bind staging and zapping of PTEs from a userptr notifier
do not work because the view of PTEs is not stable. VM binds cannot
acquire the notifier lock during staging, as memory allocations are
required. To resolve this race condition, use a staging tree for VM
binds that is committed only under the userptr notifier lock during the
final step of the bind. This ensures a consistent view of the PTEs in
the userptr notifier.

A follow-up may restrict staging to VMs in fault mode, as this is the
only mode in which the above race exists.
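
Schematically, the scheme is (a sketch of the xe_ptw and xe_pt_commit()
changes in the diff below, not a literal excerpt):

	struct xe_ptw {
		struct xe_ptw **children;	/* stable view; walked by the
						 * userptr notifier / zaps */
		struct xe_ptw **staging;	/* in-progress view built by
						 * binds under the vm lock */
	};

	/* Final bind step, under vm->userptr.notifier_lock (read): */
	pt_dir->children[j_] = pt_dir->staging[j_];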

Suggested-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: <stable@vger.kernel.org>
Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single job")
Fixes: a708f6501c69 ("drm/xe: Update PT layer with better error handling")
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_pt.c      | 50 +++++++++++++++++++++++++--------
 drivers/gpu/drm/xe/xe_pt_walk.c |  3 +-
 drivers/gpu/drm/xe/xe_pt_walk.h |  4 +++
 3 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index add521b5c6ae..118b81cd7205 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -28,6 +28,8 @@ struct xe_pt_dir {
 	struct xe_pt pt;
 	/** @children: Array of page-table child nodes */
 	struct xe_ptw *children[XE_PDES];
+	/** @staging: Array of page-table staging nodes */
+	struct xe_ptw *staging[XE_PDES];
 };
 
 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
@@ -50,7 +52,7 @@ static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt *pt)
 
 static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned int index)
 {
-	return container_of(pt_dir->children[index], struct xe_pt, base);
+	return container_of(pt_dir->staging[index], struct xe_pt, base);
 }
 
 static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm,
@@ -125,6 +127,7 @@ struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile,
 	}
 	pt->bo = bo;
 	pt->base.children = level ? as_xe_pt_dir(pt)->children : NULL;
+	pt->base.staging = level ? as_xe_pt_dir(pt)->staging : NULL;
 
 	if (vm->xef)
 		xe_drm_client_add_bo(vm->xef->client, pt->bo);
@@ -377,8 +380,10 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
 		/* Continue building a non-connected subtree. */
 		struct iosys_map *map = &parent->bo->vmap;
 
-		if (unlikely(xe_child))
+		if (unlikely(xe_child)) {
 			parent->base.children[offset] = &xe_child->base;
+			parent->base.staging[offset] = &xe_child->base;
+		}
 
 		xe_pt_write(xe_walk->vm->xe, map, offset, pte);
 		parent->num_live++;
@@ -619,6 +624,7 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
 			.ops = &xe_pt_stage_bind_ops,
 			.shifts = xe_normal_pt_shifts,
 			.max_level = XE_PT_HIGHEST_LEVEL,
+			.staging = true,
 		},
 		.vm = xe_vma_vm(vma),
 		.tile = tile,
@@ -812,6 +818,7 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
 
 struct xe_pt_zap_ptes_flags {
 	bool scratch:1;
+	bool staging:1;
 };
 
 static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma,
@@ -822,6 +829,7 @@ static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma,
 			.ops = &xe_pt_zap_ptes_ops,
 			.shifts = xe_normal_pt_shifts,
 			.max_level = XE_PT_HIGHEST_LEVEL,
+			.staging = flags.staging,
 		},
 		.tile = tile,
 		.vm = xe_vma_vm(vma),
@@ -905,7 +913,7 @@ static void xe_pt_cancel_bind(struct xe_vma *vma,
 	}
 }
 
-static void xe_pt_commit_locks_assert(struct xe_vma *vma)
+static void xe_pt_commit_prepare_locks_assert(struct xe_vma *vma)
 {
 	struct xe_vm *vm = xe_vma_vm(vma);
 
@@ -917,6 +925,16 @@ static void xe_pt_commit_locks_assert(struct xe_vma *vma)
 	xe_vm_assert_held(vm);
 }
 
+static void xe_pt_commit_locks_assert(struct xe_vma *vma)
+{
+	struct xe_vm *vm = xe_vma_vm(vma);
+
+	xe_pt_commit_prepare_locks_assert(vma);
+
+	if (xe_vma_is_userptr(vma))
+		lockdep_assert_held_read(&vm->userptr.notifier_lock);
+}
+
 static void xe_pt_commit(struct xe_vma *vma,
 			 struct xe_vm_pgtable_update *entries,
 			 u32 num_entries, struct llist_head *deferred)
@@ -927,13 +945,17 @@ static void xe_pt_commit(struct xe_vma *vma,
 
 	for (i = 0; i < num_entries; i++) {
 		struct xe_pt *pt = entries[i].pt;
+		struct xe_pt_dir *pt_dir;
 
 		if (!pt->level)
 			continue;
 
+		pt_dir = as_xe_pt_dir(pt);
 		for (j = 0; j < entries[i].qwords; j++) {
 			struct xe_pt *oldpte = entries[i].pt_entries[j].pt;
+			int j_ = j + entries[i].ofs;
 
+			pt_dir->children[j_] = pt_dir->staging[j_];
 			xe_pt_destroy(oldpte, xe_vma_vm(vma)->flags, deferred);
 		}
 	}
@@ -945,7 +967,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
 {
 	int i, j;
 
-	xe_pt_commit_locks_assert(vma);
+	xe_pt_commit_prepare_locks_assert(vma);
 
 	for (i = num_entries - 1; i >= 0; --i) {
 		struct xe_pt *pt = entries[i].pt;
@@ -963,7 +985,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
 			struct xe_pt *newpte = xe_pt_entry(pt_dir, j_);
 			struct xe_pt *oldpte = entries[i].pt_entries[j].pt;
 
-			pt_dir->children[j_] = oldpte ? &oldpte->base : 0;
+			pt_dir->staging[j_] = oldpte ? &oldpte->base : 0;
 			xe_pt_destroy(newpte, xe_vma_vm(vma)->flags, NULL);
 		}
 	}
@@ -975,7 +997,7 @@ static void xe_pt_commit_prepare_bind(struct xe_vma *vma,
 {
 	u32 i, j;
 
-	xe_pt_commit_locks_assert(vma);
+	xe_pt_commit_prepare_locks_assert(vma);
 
 	for (i = 0; i < num_entries; i++) {
 		struct xe_pt *pt = entries[i].pt;
@@ -996,7 +1018,7 @@ static void xe_pt_commit_prepare_bind(struct xe_vma *vma,
 			if (xe_pt_entry(pt_dir, j_))
 				oldpte = xe_pt_entry(pt_dir, j_);
 
-			pt_dir->children[j_] = &newpte->base;
+			pt_dir->staging[j_] = &newpte->base;
 			entries[i].pt_entries[j].pt = oldpte;
 		}
 	}
@@ -1237,7 +1259,10 @@ static void
 vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
 			    struct xe_vm_pgtable_update_ops *pt_update)
 {
-	struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
+	struct xe_pt_zap_ptes_flags flags = {
+		.scratch = true,
+		.staging = true,
+	};
 	int i, j, k;
 
 	/*
@@ -1590,6 +1615,7 @@ static unsigned int xe_pt_stage_unbind(struct xe_tile *tile, struct xe_vma *vma,
 			.ops = &xe_pt_stage_unbind_ops,
 			.shifts = xe_normal_pt_shifts,
 			.max_level = XE_PT_HIGHEST_LEVEL,
+			.staging = true,
 		},
 		.tile = tile,
 		.modified_start = xe_vma_start(vma),
@@ -1631,7 +1657,7 @@ static void xe_pt_abort_unbind(struct xe_vma *vma,
 {
 	int i, j;
 
-	xe_pt_commit_locks_assert(vma);
+	xe_pt_commit_prepare_locks_assert(vma);
 
 	for (i = num_entries - 1; i >= 0; --i) {
 		struct xe_vm_pgtable_update *entry = &entries[i];
@@ -1644,7 +1670,7 @@ static void xe_pt_abort_unbind(struct xe_vma *vma,
 			continue;
 
 		for (j = entry->ofs; j < entry->ofs + entry->qwords; j++)
-			pt_dir->children[j] =
+			pt_dir->staging[j] =
 				entries[i].pt_entries[j - entry->ofs].pt ?
 				&entries[i].pt_entries[j - entry->ofs].pt->base : NULL;
 	}
@@ -1657,7 +1683,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
 {
 	int i, j;
 
-	xe_pt_commit_locks_assert(vma);
+	xe_pt_commit_prepare_locks_assert(vma);
 
 	for (i = 0; i < num_entries; ++i) {
 		struct xe_vm_pgtable_update *entry = &entries[i];
@@ -1672,7 +1698,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
 		for (j = entry->ofs; j < entry->ofs + entry->qwords; j++) {
 			entry->pt_entries[j - entry->ofs].pt =
 				xe_pt_entry(pt_dir, j);
-			pt_dir->children[j] = NULL;
+			pt_dir->staging[j] = NULL;
 		}
 	}
 }
diff --git a/drivers/gpu/drm/xe/xe_pt_walk.c b/drivers/gpu/drm/xe/xe_pt_walk.c
index b8b3d2aea492..be602a763ff3 100644
--- a/drivers/gpu/drm/xe/xe_pt_walk.c
+++ b/drivers/gpu/drm/xe/xe_pt_walk.c
@@ -74,7 +74,8 @@ int xe_pt_walk_range(struct xe_ptw *parent, unsigned int level,
 		     u64 addr, u64 end, struct xe_pt_walk *walk)
 {
 	pgoff_t offset = xe_pt_offset(addr, level, walk);
-	struct xe_ptw **entries = parent->children ? parent->children : NULL;
+	struct xe_ptw **entries = walk->staging ? (parent->staging ?: NULL) :
+		(parent->children ?: NULL);
 	const struct xe_pt_walk_ops *ops = walk->ops;
 	enum page_walk_action action;
 	struct xe_ptw *child;
diff --git a/drivers/gpu/drm/xe/xe_pt_walk.h b/drivers/gpu/drm/xe/xe_pt_walk.h
index 5ecc4d2f0f65..5c02c244f7de 100644
--- a/drivers/gpu/drm/xe/xe_pt_walk.h
+++ b/drivers/gpu/drm/xe/xe_pt_walk.h
@@ -11,12 +11,14 @@
 /**
  * struct xe_ptw - base class for driver pagetable subclassing.
  * @children: Pointer to an array of children if any.
+ * @staging: Pointer to an array of staging if any.
  *
  * Drivers could subclass this, and if it's a page-directory, typically
  * embed an array of xe_ptw pointers.
  */
 struct xe_ptw {
 	struct xe_ptw **children;
+	struct xe_ptw **staging;
 };
 
 /**
@@ -41,6 +43,8 @@ struct xe_pt_walk {
 	 * as shared pagetables.
 	 */
 	bool shared_pt_mode;
+	/** @staging: Walk staging PT structure */
+	bool staging;
 };
 
 /**
-- 
2.34.1



* Re: [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes
  2025-02-24  4:05 ` [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes Matthew Brost
@ 2025-02-24  8:21   ` Thomas Hellström
  2025-02-24 15:06     ` Matthew Brost
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Hellström @ 2025-02-24  8:21 UTC
  To: Matthew Brost, intel-xe; +Cc: matthew.auld

On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> Squash a bind operation that follows a userptr invalidation into a
> clearing of PTEs. This prevents valid GPU page tables from pointing to
> stale CPU pages.
> 
> Fix up initial bind handling to always add VMAs to the invalidation
> list and clear PTEs.
> 
> Remove the unused rebind variable in xe_pt.
> 
> Always hold the notifier lock across TLB invalidation in the notifier
> to prevent a UAF if an unbind races.
> 
> All of the above changes are included in a single Fixes patch in hopes
> of an easier backport.
> 
> v2:
>  - Wait dma-resv bookkeep before issuing PTE zap (Thomas)
>  - Support scratch page on invalidation (Thomas)
> 
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Cc: <stable@vger.kernel.org>
> Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single job")
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_pt.c       | 146 +++++++++++++++++++++++--------
>  drivers/gpu/drm/xe/xe_pt_types.h |   3 +-
>  drivers/gpu/drm/xe/xe_vm.c       |   4 +-
>  3 files changed, 115 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index 1ddcc7e79a93..add521b5c6ae 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -351,7 +351,8 @@ xe_pt_new_shared(struct xe_walk_update *wupd, struct xe_pt *parent,
>   */
>  static int
>  xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
> -		   pgoff_t offset, struct xe_pt *xe_child, u64 pte)
> +		   pgoff_t offset, struct xe_pt *xe_child, u64 pte,
> +		   unsigned int level)
>  {
>  	struct xe_pt_update *upd = &xe_walk->wupd.updates[parent->level];
>  	struct xe_pt_update *child_upd = xe_child ?
> @@ -389,6 +390,9 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
>  		idx = offset - entry->ofs;
>  		entry->pt_entries[idx].pt = xe_child;
>  		entry->pt_entries[idx].pte = pte;
> +		entry->pt_entries[idx].level = level;
> +		if (likely(!xe_child))
> +			entry->pt_entries[idx].level |= XE_PT_IS_LEAF;
>  		entry->qwords++;
>  	}
> 
> @@ -515,7 +519,8 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
>  			}
>  		}
> 
> -		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte);
> +		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte,
> +					 level);
>  		if (unlikely(ret))
>  			return ret;
> 
> @@ -571,7 +576,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
> 
>  		pte = vm->pt_ops->pde_encode_bo(xe_child->bo, 0, pat_index) | flags;
>  		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, xe_child,
> -					 pte);
> +					 pte, level);
>  	}
> 
>  	*action = ACTION_SUBTREE;
> @@ -752,6 +757,10 @@ struct xe_pt_zap_ptes_walk {
>  	/* Input parameters for the walk */
>  	/** @tile: The tile we're building for */
>  	struct xe_tile *tile;
> +	/** @vm: VM we're building for */
> +	struct xe_vm *vm;
> +	/** @scratch: write entries with scratch */
> +	bool scratch;
> 
>  	/* Output */
>  	/** @needs_invalidate: Whether we need to invalidate TLB*/
> @@ -779,9 +788,18 @@ static int xe_pt_zap_ptes_entry(struct xe_ptw *parent, pgoff_t offset,
>  	 */
>  	if (xe_pt_nonshared_offsets(addr, next, --level, walk, action, &offset,
>  				    &end_offset)) {
> -		xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
> -			      offset * sizeof(u64), 0,
> -			      (end_offset - offset) * sizeof(u64));
> +		if (unlikely(xe_walk->scratch)) {
> +			u64 pte = __xe_pt_empty_pte(xe_walk->tile, xe_walk->vm,
> +						    level);
> +
> +			for (; offset < end_offset; ++offset)
> +				xe_pt_write(tile_to_xe(xe_walk->tile),
> +					    &xe_child->bo->vmap, offset, pte);
> +		} else {
> +			xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
> +				      offset * sizeof(u64), 0,
> +				      (end_offset - offset) * sizeof(u64));
> +		}
>  		xe_walk->needs_invalidate = true;
>  	}
> 
> @@ -792,6 +810,31 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
>  	.pt_entry = xe_pt_zap_ptes_entry,
>  };
> 
> +struct xe_pt_zap_ptes_flags {
> +	bool scratch:1;
> +};
> +
> +static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma,
> +			     struct xe_pt_zap_ptes_flags flags)
> +{
> +	struct xe_pt_zap_ptes_walk xe_walk = {
> +		.base = {
> +			.ops = &xe_pt_zap_ptes_ops,
> +			.shifts = xe_normal_pt_shifts,
> +			.max_level = XE_PT_HIGHEST_LEVEL,
> +		},
> +		.tile = tile,
> +		.vm = xe_vma_vm(vma),
> +		.scratch = flags.scratch,
> +	};
> +	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> +
> +	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
> +				xe_vma_end(vma), &xe_walk.base);
> +
> +	return xe_walk.needs_invalidate;
> +}
> +
>  /**
>   * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
>   * @tile: The tile we're zapping for.
> @@ -810,24 +853,13 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
>   */
>  bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
>  {
> -	struct xe_pt_zap_ptes_walk xe_walk = {
> -		.base = {
> -			.ops = &xe_pt_zap_ptes_ops,
> -			.shifts = xe_normal_pt_shifts,
> -			.max_level = XE_PT_HIGHEST_LEVEL,
> -		},
> -		.tile = tile,
> -	};
> -	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> +	struct xe_pt_zap_ptes_flags flags = {};
>  	u8 pt_mask = (vma->tile_present & ~vma->tile_invalidated);
> 
>  	if (!(pt_mask & BIT(tile->id)))
>  		return false;
> 
> -	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
> -				xe_vma_end(vma), &xe_walk.base);
> -
> -	return xe_walk.needs_invalidate;
> +	return __xe_pt_zap_ptes(tile, vma, flags);
>  }
> 
>  static void
> @@ -1201,7 +1233,46 @@ static bool xe_pt_userptr_inject_eagain(struct xe_userptr_vma *uvma)
> 
>  #endif
> 
> -static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> +static void
> +vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
> +			    struct xe_vm_pgtable_update_ops *pt_update)
> +{
> +	struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
> +	int i, j, k;
> +
> +	/*
> +	 * Need to update this function to bypass scratch setup if in fault mode
> +	 */
> +	xe_assert(xe_vma_vm(vma)->xe, !xe_vm_in_fault_mode(xe_vma_vm(vma)));
> +
> +	for (i = 0; i < pt_update->current_op; ++i) {
> +		struct xe_vm_pgtable_update_op *op = &pt_update->ops[i];
> +
> +		if (vma != op->vma || (!op->bind && !op->rebind))
> +			continue;
> +
> +		for (j = 0; j < op->num_entries; ++j) {
> +			for (k = 0; k < op->entries[j].qwords; ++k) {
> +				struct xe_pt_entry *entry =
> +					&op->entries[j].pt_entries[k];
> +				unsigned int level = entry->level;
> +
> +				if (!(level & XE_PT_IS_LEAF))
> +					continue;
> +
> +				level &= ~XE_PT_IS_LEAF;
> +				entry->pte = __xe_pt_empty_pte(tile,
> +							       xe_vma_vm(vma),
> +							       level);
> +			}
> +		}
> +	}
> +
> +	__xe_pt_zap_ptes(tile, vma, flags);

As mentioned in my previous email, I'm pretty sure that if we modify
all the ptes in the entry array, not just the leaves (that's basically
all ptes of shared page-table entries), that will be equivalent to a
zap.
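
Roughly, that alternative would be (a sketch of the suggestion, under
the assumption that the staged entries cover everything the zap walk
would otherwise touch):

	/* Rewrite every staged qword, not only the leaves, so the bind
	 * job itself writes out what a zap of the range would; the
	 * trailing __xe_pt_zap_ptes() call would then be unnecessary. */
	for (j = 0; j < op->num_entries; ++j) {
		for (k = 0; k < op->entries[j].qwords; ++k) {
			struct xe_pt_entry *entry =
				&op->entries[j].pt_entries[k];
			unsigned int level = entry->level & ~XE_PT_IS_LEAF;

			entry->pte = __xe_pt_empty_pte(tile, xe_vma_vm(vma),
						       level);
		}
	}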

/Thomas


> +}
> +
> +static int vma_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
> +			     struct xe_vma *vma,
>  			     struct xe_vm_pgtable_update_ops *pt_update)
>  {
>  	struct xe_userptr_vma *uvma;
> @@ -1215,9 +1286,6 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
>  	uvma = to_userptr_vma(vma);
>  	notifier_seq = uvma->userptr.notifier_seq;
> 
> -	if (uvma->userptr.initial_bind && !xe_vm_in_fault_mode(vm))
> -		return 0;
> -
>  	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
>  				     notifier_seq) &&
>  	    !xe_pt_userptr_inject_eagain(uvma))
> @@ -1226,6 +1294,8 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
>  	if (xe_vm_in_fault_mode(vm)) {
>  		return -EAGAIN;
>  	} else {
> +		long err;
> +
>  		spin_lock(&vm->userptr.invalidated_lock);
>  		list_move_tail(&uvma->userptr.invalidate_link,
>  			       &vm->userptr.invalidated);
> @@ -1234,25 +1304,27 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
>  		if (xe_vm_in_preempt_fence_mode(vm)) {
>  			struct dma_resv_iter cursor;
>  			struct dma_fence *fence;
> -			long err;
> 
>  			dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
>  					    DMA_RESV_USAGE_BOOKKEEP);
>  			dma_resv_for_each_fence_unlocked(&cursor, fence)
>  				dma_fence_enable_sw_signaling(fence);
>  			dma_resv_iter_end(&cursor);
> -
> -			err = dma_resv_wait_timeout(xe_vm_resv(vm),
> -						    DMA_RESV_USAGE_BOOKKEEP,
> -						    false, MAX_SCHEDULE_TIMEOUT);
> -			XE_WARN_ON(err <= 0);
>  		}
> +
> +		err = dma_resv_wait_timeout(xe_vm_resv(vm),
> +					    DMA_RESV_USAGE_BOOKKEEP,
> +					    false, MAX_SCHEDULE_TIMEOUT);
> +		XE_WARN_ON(err <= 0);
> +
> +		vma_convert_to_invalidation(tile, vma, pt_update);
>  	}
> 
>  	return 0;
>  }
> 
> -static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
> +static int op_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
> +			    struct xe_vma_op *op,
>  			    struct xe_vm_pgtable_update_ops *pt_update)
>  {
>  	int err = 0;
> @@ -1264,18 +1336,21 @@ static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
>  		if (!op->map.immediate && xe_vm_in_fault_mode(vm))
>  			break;
> 
> -		err = vma_check_userptr(vm, op->map.vma, pt_update);
> +		err = vma_check_userptr(tile, vm, op->map.vma, pt_update);
>  		break;
>  	case DRM_GPUVA_OP_REMAP:
>  		if (op->remap.prev)
> -			err = vma_check_userptr(vm, op->remap.prev, pt_update);
> +			err = vma_check_userptr(tile, vm, op->remap.prev,
> +						pt_update);
>  		if (!err && op->remap.next)
> -			err = vma_check_userptr(vm, op->remap.next, pt_update);
> +			err = vma_check_userptr(tile, vm, op->remap.next,
> +						pt_update);
>  		break;
>  	case DRM_GPUVA_OP_UNMAP:
>  		break;
>  	case DRM_GPUVA_OP_PREFETCH:
> -		err = vma_check_userptr(vm, gpuva_to_vma(op->base.prefetch.va),
> +		err = vma_check_userptr(tile, vm,
> +					gpuva_to_vma(op->base.prefetch.va),
>  					pt_update);
>  		break;
>  	default:
> @@ -1301,7 +1376,8 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
>  	down_read(&vm->userptr.notifier_lock);
> 
>  	list_for_each_entry(op, &vops->list, link) {
> -		err = op_check_userptr(vm, op, pt_update_ops);
> +		err = op_check_userptr(&vm->xe->tiles[pt_update->tile_id],
> +				       vm, op, pt_update_ops);
>  		if (err) {
>  			up_read(&vm->userptr.notifier_lock);
>  			break;
> diff --git a/drivers/gpu/drm/xe/xe_pt_types.h b/drivers/gpu/drm/xe/xe_pt_types.h
> index 384cc04de719..6f99ff2b8fce 100644
> --- a/drivers/gpu/drm/xe/xe_pt_types.h
> +++ b/drivers/gpu/drm/xe/xe_pt_types.h
> @@ -29,7 +29,6 @@ struct xe_pt {
>  	struct xe_bo *bo;
>  	unsigned int level;
>  	unsigned int num_live;
> -	bool rebind;
>  	bool is_compact;
>  #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
>  	/** addr: Virtual address start address of the PT. */
> @@ -52,6 +51,8 @@ struct xe_pt_ops {
>  struct xe_pt_entry {
>  	struct xe_pt *pt;
>  	u64 pte;
> +#define XE_PT_IS_LEAF	BIT(31)
> +	unsigned int level;
>  };
> 
>  struct xe_vm_pgtable_update {
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index ea2e287e6526..f90e5c92010c 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -623,8 +623,6 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
>  		spin_unlock(&vm->userptr.invalidated_lock);
>  	}
> 
> -	up_write(&vm->userptr.notifier_lock);
> -
>  	/*
>  	 * Preempt fences turn into schedule disables, pipeline these.
>  	 * Note that even in fault mode, we need to wait for binds and
> @@ -647,6 +645,8 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
>  		XE_WARN_ON(err);
>  	}
> 
> +	up_write(&vm->userptr.notifier_lock);
> +
>  	trace_xe_vma_userptr_invalidate_complete(vma);
> 
>  	return true;



* Re: [PATCH v2 3/3] drm/xe: Add staging tree for VM binds
  2025-02-24  4:05 ` [PATCH v2 3/3] drm/xe: Add staging tree for VM binds Matthew Brost
@ 2025-02-24  9:22   ` Thomas Hellström
  2025-02-24 14:41     ` Matthew Brost
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Hellström @ 2025-02-24  9:22 UTC
  To: Matthew Brost, intel-xe; +Cc: matthew.auld

Hi,

On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> Concurrent VM bind staging and zapping of PTEs from a userptr notifier
> do not work because the view of PTEs is not stable. VM binds cannot
> acquire the notifier lock during staging, as memory allocations are
> required. To resolve this race condition, use a staging tree for VM
> binds that is committed only under the userptr notifier lock during the
> final step of the bind. This ensures a consistent view of the PTEs in
> the userptr notifier.
> 
> A follow-up may restrict staging to VMs in fault mode, as this is the
> only mode in which the above race exists.
> 
> Suggested-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Cc: <stable@vger.kernel.org>
> Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single job")
> Fixes: a708f6501c69 ("drm/xe: Update PT layer with better error handling")
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

This looks even simpler than what I initially thought. Ideally, I think
the staging tree would exist only under the vm lock during the multi
bind-unbind operation, but that adds code complication and execution
time compared to this solution, which is more memory-hungry.

Minor comment below.

> ---
>  drivers/gpu/drm/xe/xe_pt.c      | 50 +++++++++++++++++++++++--------
>  drivers/gpu/drm/xe/xe_pt_walk.c |  3 +-
>  drivers/gpu/drm/xe/xe_pt_walk.h |  4 +++
>  3 files changed, 44 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index add521b5c6ae..118b81cd7205 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -28,6 +28,8 @@ struct xe_pt_dir {
>  	struct xe_pt pt;
>  	/** @children: Array of page-table child nodes */
>  	struct xe_ptw *children[XE_PDES];
> +	/** @staging: Array of page-table staging nodes */
> +	struct xe_ptw *staging[XE_PDES];
>  };
> 
>  #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
> @@ -50,7 +52,7 @@ static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt *pt)
> 
>  static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned int index)
>  {
> -	return container_of(pt_dir->children[index], struct xe_pt, base);
> +	return container_of(pt_dir->staging[index], struct xe_pt, base);

It looks like the only user here is with staging == true, but I'm a
little worried about future users. Perhaps include a staging bool in
the interface and assert that it is true? Or rename to
xe_pt_entry_staging()?
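
i.e. something like (hypothetical rename, body identical to the patched
xe_pt_entry()):

	static struct xe_pt *xe_pt_entry_staging(struct xe_pt_dir *pt_dir,
						 unsigned int index)
	{
		return container_of(pt_dir->staging[index], struct xe_pt, base);
	}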


>  }
> 
>  static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm,
> @@ -125,6 +127,7 @@ struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile,
>  	}
>  	pt->bo = bo;
>  	pt->base.children = level ? as_xe_pt_dir(pt)->children : NULL;
> +	pt->base.staging = level ? as_xe_pt_dir(pt)->staging : NULL;
> 
>  	if (vm->xef)
>  		xe_drm_client_add_bo(vm->xef->client, pt->bo);
> @@ -377,8 +380,10 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
>  		/* Continue building a non-connected subtree. */
>  		struct iosys_map *map = &parent->bo->vmap;
> 
> -		if (unlikely(xe_child))
> +		if (unlikely(xe_child)) {
>  			parent->base.children[offset] = &xe_child->base;
> +			parent->base.staging[offset] = &xe_child->base;
> +		}
> 
>  		xe_pt_write(xe_walk->vm->xe, map, offset, pte);
>  		parent->num_live++;
> @@ -619,6 +624,7 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
>  			.ops = &xe_pt_stage_bind_ops,
>  			.shifts = xe_normal_pt_shifts,
>  			.max_level = XE_PT_HIGHEST_LEVEL,
> +			.staging = true,
>  		},
>  		.vm = xe_vma_vm(vma),
>  		.tile = tile,
> @@ -812,6 +818,7 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
> 
>  struct xe_pt_zap_ptes_flags {
>  	bool scratch:1;
> +	bool staging:1;
>  };

As mentioned in the comments on the previous commit, I don't think
zapping is needed when aborting a bind.

Otherwise LGTM
/Thomas

>  
>  static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma
> *vma,
> @@ -822,6 +829,7 @@ static bool __xe_pt_zap_ptes(struct xe_tile
> *tile, struct xe_vma *vma,
>  			.ops = &xe_pt_zap_ptes_ops,
>  			.shifts = xe_normal_pt_shifts,
>  			.max_level = XE_PT_HIGHEST_LEVEL,
> +			.staging = flags.staging,
>  		},
>  		.tile = tile,
>  		.vm = xe_vma_vm(vma),
> @@ -905,7 +913,7 @@ static void xe_pt_cancel_bind(struct xe_vma *vma,
>  	}
>  }
>  
> -static void xe_pt_commit_locks_assert(struct xe_vma *vma)
> +static void xe_pt_commit_prepare_locks_assert(struct xe_vma *vma)
>  {
>  	struct xe_vm *vm = xe_vma_vm(vma);
>  
> @@ -917,6 +925,16 @@ static void xe_pt_commit_locks_assert(struct
> xe_vma *vma)
>  	xe_vm_assert_held(vm);
>  }
>  
> +static void xe_pt_commit_locks_assert(struct xe_vma *vma)
> +{
> +	struct xe_vm *vm = xe_vma_vm(vma);
> +
> +	xe_pt_commit_prepare_locks_assert(vma);
> +
> +	if (xe_vma_is_userptr(vma))
> +		lockdep_assert_held_read(&vm-
> >userptr.notifier_lock);
> +}
> +
>  static void xe_pt_commit(struct xe_vma *vma,
>  			 struct xe_vm_pgtable_update *entries,
>  			 u32 num_entries, struct llist_head
> *deferred)
> @@ -927,13 +945,17 @@ static void xe_pt_commit(struct xe_vma *vma,
>  
>  	for (i = 0; i < num_entries; i++) {
>  		struct xe_pt *pt = entries[i].pt;
> +		struct xe_pt_dir *pt_dir;
>  
>  		if (!pt->level)
>  			continue;
>  
> +		pt_dir = as_xe_pt_dir(pt);
>  		for (j = 0; j < entries[i].qwords; j++) {
>  			struct xe_pt *oldpte =
> entries[i].pt_entries[j].pt;
> +			int j_ = j + entries[i].ofs;
>  
> +			pt_dir->children[j_] = pt_dir->staging[j_];
>  			xe_pt_destroy(oldpte, xe_vma_vm(vma)->flags,
> deferred);
>  		}
>  	}
> @@ -945,7 +967,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
>  {
>  	int i, j;
>  
> -	xe_pt_commit_locks_assert(vma);
> +	xe_pt_commit_prepare_locks_assert(vma);
>  
>  	for (i = num_entries - 1; i >= 0; --i) {
>  		struct xe_pt *pt = entries[i].pt;
> @@ -963,7 +985,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
>  			struct xe_pt *newpte = xe_pt_entry(pt_dir, j_);
>  			struct xe_pt *oldpte = entries[i].pt_entries[j].pt;
>  
> -			pt_dir->children[j_] = oldpte ? &oldpte->base : 0;
> +			pt_dir->staging[j_] = oldpte ? &oldpte->base : 0;
>  			xe_pt_destroy(newpte, xe_vma_vm(vma)->flags, NULL);
>  		}
>  	}
> @@ -975,7 +997,7 @@ static void xe_pt_commit_prepare_bind(struct
> xe_vma *vma,
>  {
>  	u32 i, j;
>  
> -	xe_pt_commit_locks_assert(vma);
> +	xe_pt_commit_prepare_locks_assert(vma);
>  
>  	for (i = 0; i < num_entries; i++) {
>  		struct xe_pt *pt = entries[i].pt;
> @@ -996,7 +1018,7 @@ static void xe_pt_commit_prepare_bind(struct
> xe_vma *vma,
>  			if (xe_pt_entry(pt_dir, j_))
>  				oldpte = xe_pt_entry(pt_dir, j_);
>  
> -			pt_dir->children[j_] = &newpte->base;
> +			pt_dir->staging[j_] = &newpte->base;
>  			entries[i].pt_entries[j].pt = oldpte;
>  		}
>  	}
> @@ -1237,7 +1259,10 @@ static void
>  vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
>  			    struct xe_vm_pgtable_update_ops *pt_update)
>  {
> -	struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
> +	struct xe_pt_zap_ptes_flags flags = {
> +		.scratch = true,
> +		.staging = true,
> +	};
>  	int i, j, k;
>  
>  	/*
> @@ -1590,6 +1615,7 @@ static unsigned int xe_pt_stage_unbind(struct
> xe_tile *tile, struct xe_vma *vma,
>  			.ops = &xe_pt_stage_unbind_ops,
>  			.shifts = xe_normal_pt_shifts,
>  			.max_level = XE_PT_HIGHEST_LEVEL,
> +			.staging = true,
>  		},
>  		.tile = tile,
>  		.modified_start = xe_vma_start(vma),
> @@ -1631,7 +1657,7 @@ static void xe_pt_abort_unbind(struct xe_vma
> *vma,
>  {
>  	int i, j;
>  
> -	xe_pt_commit_locks_assert(vma);
> +	xe_pt_commit_prepare_locks_assert(vma);
>  
>  	for (i = num_entries - 1; i >= 0; --i) {
>  		struct xe_vm_pgtable_update *entry = &entries[i];
> @@ -1644,7 +1670,7 @@ static void xe_pt_abort_unbind(struct xe_vma
> *vma,
>  			continue;
>  
>  		for (j = entry->ofs; j < entry->ofs + entry->qwords; j++)
> -			pt_dir->children[j] =
> +			pt_dir->staging[j] =
>  				entries[i].pt_entries[j - entry->ofs].pt ?
>  				&entries[i].pt_entries[j - entry->ofs].pt->base : NULL;
>  	}
> @@ -1657,7 +1683,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
>  {
>  	int i, j;
>  
> -	xe_pt_commit_locks_assert(vma);
> +	xe_pt_commit_prepare_locks_assert(vma);
>  
>  	for (i = 0; i < num_entries; ++i) {
>  		struct xe_vm_pgtable_update *entry = &entries[i];
> @@ -1672,7 +1698,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
>  		for (j = entry->ofs; j < entry->ofs + entry->qwords; j++) {
>  			entry->pt_entries[j - entry->ofs].pt =
>  				xe_pt_entry(pt_dir, j);
> -			pt_dir->children[j] = NULL;
> +			pt_dir->staging[j] = NULL;
>  		}
>  	}
>  }
> diff --git a/drivers/gpu/drm/xe/xe_pt_walk.c
> b/drivers/gpu/drm/xe/xe_pt_walk.c
> index b8b3d2aea492..be602a763ff3 100644
> --- a/drivers/gpu/drm/xe/xe_pt_walk.c
> +++ b/drivers/gpu/drm/xe/xe_pt_walk.c
> @@ -74,7 +74,8 @@ int xe_pt_walk_range(struct xe_ptw *parent,
> unsigned int level,
>  		     u64 addr, u64 end, struct xe_pt_walk *walk)
>  {
>  	pgoff_t offset = xe_pt_offset(addr, level, walk);
> -	struct xe_ptw **entries = parent->children ? parent->children : NULL;
> +	struct xe_ptw **entries = walk->staging ? (parent->staging ?: NULL) :
> +		(parent->children ?: NULL);
>  	const struct xe_pt_walk_ops *ops = walk->ops;
>  	enum page_walk_action action;
>  	struct xe_ptw *child;
> diff --git a/drivers/gpu/drm/xe/xe_pt_walk.h
> b/drivers/gpu/drm/xe/xe_pt_walk.h
> index 5ecc4d2f0f65..5c02c244f7de 100644
> --- a/drivers/gpu/drm/xe/xe_pt_walk.h
> +++ b/drivers/gpu/drm/xe/xe_pt_walk.h
> @@ -11,12 +11,14 @@
>  /**
>   * struct xe_ptw - base class for driver pagetable subclassing.
>   * @children: Pointer to an array of children if any.
> + * @staging: Pointer to an array of staging if any.
>   *
>   * Drivers could subclass this, and if it's a page-directory,
> typically
>   * embed an array of xe_ptw pointers.
>   */
>  struct xe_ptw {
>  	struct xe_ptw **children;
> +	struct xe_ptw **staging;
>  };
>  
>  /**
> @@ -41,6 +43,8 @@ struct xe_pt_walk {
>  	 * as shared pagetables.
>  	 */
>  	bool shared_pt_mode;
> +	/** @staging: Walk staging PT structure */
> +	bool staging;
>  };
>  
>  /**



* Re: [PATCH v2 3/3] drm/xe: Add staging tree for VM binds
  2025-02-24  9:22   ` Thomas Hellström
@ 2025-02-24 14:41     ` Matthew Brost
  0 siblings, 0 replies; 14+ messages in thread
From: Matthew Brost @ 2025-02-24 14:41 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: intel-xe, matthew.auld

On Mon, Feb 24, 2025 at 10:22:37AM +0100, Thomas Hellström wrote:
> Hi,
> 
> On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> > Concurrent VM bind staging and zapping of PTEs from a userptr
> > notifier
> > do not work because the view of PTEs is not stable. VM binds cannot
> > acquire the notifier lock during staging, as memory allocations are
> > required. To resolve this race condition, use a staging tree for VM
> > binds that is committed only under the userptr notifier lock during
> > the
> > final step of the bind. This ensures a consistent view of the PTEs in
> > the userptr notifier.
> > 
> > A follow up may only use staging for VM in fault mode as this is the
> > only mode in which the above race exists.
> > 
> > Suggested-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Cc: <stable@vger.kernel.org>
> > Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single
> > job")
> > Fixes: a708f6501c69 ("drm/xe: Update PT layer with better error
> > handling")
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> 
> This looks even simpler than what I initially thought. Ideally I think

Was pleasantly surprised by how simple this was.

> the staging tree would exist only under the vm lock during the multi
> bind-unbind operation, but that adds code complication and execution
> time compared to this solution which is more memory-hungry.
>

It does take a bit more memory, but I doubt that is a concern. We only
allocate an extra page of memory for each PDE, which is pretty minor
considering the smallest PDE still maps 1G of memory. As I mention in
the commit message, we could even drop the staging for non-faulting
VMs too.
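
Back-of-envelope, taking the numbers above at face value: one extra 4K
page of staging pointers per PDE that maps at least 1G of address space
is 4096 / 2^30, i.e. on the order of 0.0004% overhead relative to the
range covered.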
 
> Minor comment below.
> 
> > ---
> >  drivers/gpu/drm/xe/xe_pt.c      | 50 +++++++++++++++++++++++++--------
> >  drivers/gpu/drm/xe/xe_pt_walk.c |  3 +-
> >  drivers/gpu/drm/xe/xe_pt_walk.h |  4 +++
> >  3 files changed, 44 insertions(+), 13 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > index add521b5c6ae..118b81cd7205 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.c
> > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > @@ -28,6 +28,8 @@ struct xe_pt_dir {
> >  	struct xe_pt pt;
> >  	/** @children: Array of page-table child nodes */
> >  	struct xe_ptw *children[XE_PDES];
> > +	/** @staging: Array of page-table staging nodes */
> > +	struct xe_ptw *staging[XE_PDES];
> >  };
> >  
> >  #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
> > @@ -50,7 +52,7 @@ static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt
> > *pt)
> >  
> >  static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned
> > int index)
> >  {
> > -	return container_of(pt_dir->children[index], struct xe_pt, base);
> > +	return container_of(pt_dir->staging[index], struct xe_pt, base);
> 
> It looks like the only user here is with staging == true, but I'm a
> little worried about future users. Perhaps include a staging bool in
> the interface and assert that it is true? Or rename to
> xe_pt_entry_staging()?
> 

I had a bool originally but found that xe_pt_entry is only called during
the staging steps, so I dropped it. I can rename it to
xe_pt_entry_staging for clarity.
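
A minimal sketch of the rename variant, for illustration only (derived
from the quoted diff, not from a posted revision):

	/* All current callers operate on the staging tree. */
	static struct xe_pt *
	xe_pt_entry_staging(struct xe_pt_dir *pt_dir, unsigned int index)
	{
		return container_of(pt_dir->staging[index], struct xe_pt,
				    base);
	}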

> 
> >  }
> >  
> >  static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm,
> > @@ -125,6 +127,7 @@ struct xe_pt *xe_pt_create(struct xe_vm *vm,
> > struct xe_tile *tile,
> >  	}
> >  	pt->bo = bo;
> >  	pt->base.children = level ? as_xe_pt_dir(pt)->children : NULL;
> > +	pt->base.staging = level ? as_xe_pt_dir(pt)->staging : NULL;
> >  
> >  	if (vm->xef)
> >  		xe_drm_client_add_bo(vm->xef->client, pt->bo);
> > @@ -377,8 +380,10 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk
> > *xe_walk, struct xe_pt *parent,
> >  		/* Continue building a non-connected subtree. */
> >  		struct iosys_map *map = &parent->bo->vmap;
> >  
> > -		if (unlikely(xe_child))
> > +		if (unlikely(xe_child)) {
> >  			parent->base.children[offset] = &xe_child->base;
> > +			parent->base.staging[offset] = &xe_child->base;
> > +		}
> >  
> >  		xe_pt_write(xe_walk->vm->xe, map, offset, pte);
> >  		parent->num_live++;
> > @@ -619,6 +624,7 @@ xe_pt_stage_bind(struct xe_tile *tile, struct
> > xe_vma *vma,
> >  			.ops = &xe_pt_stage_bind_ops,
> >  			.shifts = xe_normal_pt_shifts,
> >  			.max_level = XE_PT_HIGHEST_LEVEL,
> > +			.staging = true,
> >  		},
> >  		.vm = xe_vma_vm(vma),
> >  		.tile = tile,
> > @@ -812,6 +818,7 @@ static const struct xe_pt_walk_ops
> > xe_pt_zap_ptes_ops = {
> >  
> >  struct xe_pt_zap_ptes_flags {
> >  	bool scratch:1;
> > +	bool staging:1;
> >  };
> 
> As mentioned in the comments to the previous commit, I don't think
> zapping is needed when aborting a bind.
> 

Let me reply there as I think there is a bit of confusion still.

Matt

> Otherwise LGTM
> /Thomas
> 
> >  
> >  static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma,
> > @@ -822,6 +829,7 @@ static bool __xe_pt_zap_ptes(struct xe_tile
> > *tile, struct xe_vma *vma,
> >  			.ops = &xe_pt_zap_ptes_ops,
> >  			.shifts = xe_normal_pt_shifts,
> >  			.max_level = XE_PT_HIGHEST_LEVEL,
> > +			.staging = flags.staging,
> >  		},
> >  		.tile = tile,
> >  		.vm = xe_vma_vm(vma),
> > @@ -905,7 +913,7 @@ static void xe_pt_cancel_bind(struct xe_vma *vma,
> >  	}
> >  }
> >  
> > -static void xe_pt_commit_locks_assert(struct xe_vma *vma)
> > +static void xe_pt_commit_prepare_locks_assert(struct xe_vma *vma)
> >  {
> >  	struct xe_vm *vm = xe_vma_vm(vma);
> >  
> > @@ -917,6 +925,16 @@ static void xe_pt_commit_locks_assert(struct
> > xe_vma *vma)
> >  	xe_vm_assert_held(vm);
> >  }
> >  
> > +static void xe_pt_commit_locks_assert(struct xe_vma *vma)
> > +{
> > +	struct xe_vm *vm = xe_vma_vm(vma);
> > +
> > +	xe_pt_commit_prepare_locks_assert(vma);
> > +
> > +	if (xe_vma_is_userptr(vma))
> > +		lockdep_assert_held_read(&vm->userptr.notifier_lock);
> > +}
> > +
> >  static void xe_pt_commit(struct xe_vma *vma,
> >  			 struct xe_vm_pgtable_update *entries,
> >  			 u32 num_entries, struct llist_head *deferred)
> > @@ -927,13 +945,17 @@ static void xe_pt_commit(struct xe_vma *vma,
> >  
> >  	for (i = 0; i < num_entries; i++) {
> >  		struct xe_pt *pt = entries[i].pt;
> > +		struct xe_pt_dir *pt_dir;
> >  
> >  		if (!pt->level)
> >  			continue;
> >  
> > +		pt_dir = as_xe_pt_dir(pt);
> >  		for (j = 0; j < entries[i].qwords; j++) {
> >  			struct xe_pt *oldpte = entries[i].pt_entries[j].pt;
> > +			int j_ = j + entries[i].ofs;
> >  
> > +			pt_dir->children[j_] = pt_dir->staging[j_];
> >  			xe_pt_destroy(oldpte, xe_vma_vm(vma)->flags, deferred);
> >  		}
> >  	}
> > @@ -945,7 +967,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
> >  {
> >  	int i, j;
> >  
> > -	xe_pt_commit_locks_assert(vma);
> > +	xe_pt_commit_prepare_locks_assert(vma);
> >  
> >  	for (i = num_entries - 1; i >= 0; --i) {
> >  		struct xe_pt *pt = entries[i].pt;
> > @@ -963,7 +985,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
> >  			struct xe_pt *newpte = xe_pt_entry(pt_dir, j_);
> >  			struct xe_pt *oldpte = entries[i].pt_entries[j].pt;
> >  
> > -			pt_dir->children[j_] = oldpte ? &oldpte->base : 0;
> > +			pt_dir->staging[j_] = oldpte ? &oldpte->base : 0;
> >  			xe_pt_destroy(newpte, xe_vma_vm(vma)->flags, NULL);
> >  		}
> >  	}
> > @@ -975,7 +997,7 @@ static void xe_pt_commit_prepare_bind(struct
> > xe_vma *vma,
> >  {
> >  	u32 i, j;
> >  
> > -	xe_pt_commit_locks_assert(vma);
> > +	xe_pt_commit_prepare_locks_assert(vma);
> >  
> >  	for (i = 0; i < num_entries; i++) {
> >  		struct xe_pt *pt = entries[i].pt;
> > @@ -996,7 +1018,7 @@ static void xe_pt_commit_prepare_bind(struct
> > xe_vma *vma,
> >  			if (xe_pt_entry(pt_dir, j_))
> >  				oldpte = xe_pt_entry(pt_dir, j_);
> >  
> > -			pt_dir->children[j_] = &newpte->base;
> > +			pt_dir->staging[j_] = &newpte->base;
> >  			entries[i].pt_entries[j].pt = oldpte;
> >  		}
> >  	}
> > @@ -1237,7 +1259,10 @@ static void
> >  vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
> >  			    struct xe_vm_pgtable_update_ops *pt_update)
> >  {
> > -	struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
> > +	struct xe_pt_zap_ptes_flags flags = {
> > +		.scratch = true,
> > +		.staging = true,
> > +	};
> >  	int i, j, k;
> >  
> >  	/*
> > @@ -1590,6 +1615,7 @@ static unsigned int xe_pt_stage_unbind(struct
> > xe_tile *tile, struct xe_vma *vma,
> >  			.ops = &xe_pt_stage_unbind_ops,
> >  			.shifts = xe_normal_pt_shifts,
> >  			.max_level = XE_PT_HIGHEST_LEVEL,
> > +			.staging = true,
> >  		},
> >  		.tile = tile,
> >  		.modified_start = xe_vma_start(vma),
> > @@ -1631,7 +1657,7 @@ static void xe_pt_abort_unbind(struct xe_vma
> > *vma,
> >  {
> >  	int i, j;
> >  
> > -	xe_pt_commit_locks_assert(vma);
> > +	xe_pt_commit_prepare_locks_assert(vma);
> >  
> >  	for (i = num_entries - 1; i >= 0; --i) {
> >  		struct xe_vm_pgtable_update *entry = &entries[i];
> > @@ -1644,7 +1670,7 @@ static void xe_pt_abort_unbind(struct xe_vma
> > *vma,
> >  			continue;
> >  
> >  		for (j = entry->ofs; j < entry->ofs + entry->qwords; j++)
> > -			pt_dir->children[j] =
> > +			pt_dir->staging[j] =
> >  				entries[i].pt_entries[j - entry->ofs].pt ?
> >  				&entries[i].pt_entries[j - entry->ofs].pt->base : NULL;
> >  	}
> > @@ -1657,7 +1683,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
> >  {
> >  	int i, j;
> >  
> > -	xe_pt_commit_locks_assert(vma);
> > +	xe_pt_commit_prepare_locks_assert(vma);
> >  
> >  	for (i = 0; i < num_entries; ++i) {
> >  		struct xe_vm_pgtable_update *entry = &entries[i];
> > @@ -1672,7 +1698,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
> >  		for (j = entry->ofs; j < entry->ofs + entry->qwords; j++) {
> >  			entry->pt_entries[j - entry->ofs].pt =
> >  				xe_pt_entry(pt_dir, j);
> > -			pt_dir->children[j] = NULL;
> > +			pt_dir->staging[j] = NULL;
> >  		}
> >  	}
> >  }
> > diff --git a/drivers/gpu/drm/xe/xe_pt_walk.c
> > b/drivers/gpu/drm/xe/xe_pt_walk.c
> > index b8b3d2aea492..be602a763ff3 100644
> > --- a/drivers/gpu/drm/xe/xe_pt_walk.c
> > +++ b/drivers/gpu/drm/xe/xe_pt_walk.c
> > @@ -74,7 +74,8 @@ int xe_pt_walk_range(struct xe_ptw *parent,
> > unsigned int level,
> >  		     u64 addr, u64 end, struct xe_pt_walk *walk)
> >  {
> >  	pgoff_t offset = xe_pt_offset(addr, level, walk);
> > -	struct xe_ptw **entries = parent->children ? parent->children : NULL;
> > +	struct xe_ptw **entries = walk->staging ? (parent->staging ?: NULL) :
> > +		(parent->children ?: NULL);
> >  	const struct xe_pt_walk_ops *ops = walk->ops;
> >  	enum page_walk_action action;
> >  	struct xe_ptw *child;
> > diff --git a/drivers/gpu/drm/xe/xe_pt_walk.h
> > b/drivers/gpu/drm/xe/xe_pt_walk.h
> > index 5ecc4d2f0f65..5c02c244f7de 100644
> > --- a/drivers/gpu/drm/xe/xe_pt_walk.h
> > +++ b/drivers/gpu/drm/xe/xe_pt_walk.h
> > @@ -11,12 +11,14 @@
> >  /**
> >   * struct xe_ptw - base class for driver pagetable subclassing.
> >   * @children: Pointer to an array of children if any.
> > + * @staging: Pointer to an array of staging if any.
> >   *
> >   * Drivers could subclass this, and if it's a page-directory,
> > typically
> >   * embed an array of xe_ptw pointers.
> >   */
> >  struct xe_ptw {
> >  	struct xe_ptw **children;
> > +	struct xe_ptw **staging;
> >  };
> >  
> >  /**
> > @@ -41,6 +43,8 @@ struct xe_pt_walk {
> >  	 * as shared pagetables.
> >  	 */
> >  	bool shared_pt_mode;
> > +	/** @staging: Walk staging PT structure */
> > +	bool staging;
> >  };
> >  
> >  /**
> 


* Re: [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes
  2025-02-24  8:21   ` Thomas Hellström
@ 2025-02-24 15:06     ` Matthew Brost
  2025-02-24 15:20       ` Thomas Hellström
  0 siblings, 1 reply; 14+ messages in thread
From: Matthew Brost @ 2025-02-24 15:06 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: intel-xe, matthew.auld

On Mon, Feb 24, 2025 at 09:21:09AM +0100, Thomas Hellström wrote:
> On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> > Squash bind operation after userptr invalidation into a clearing of
> > PTEs. Will prevent valid GPU page tables from pointing to stale CPU
> > pages.
> > 
> > Fixup initial bind handling always add VMAs to invalidation list and
> > clear PTEs.
> > 
> > Remove unused rebind variable in xe_pt.
> > 
> > Always hold notifier across TLB invalidation in notifier to prevent a
> > UAF if an unbind races.
> > 
> > Including all of the above changes for Fixes patch in hopes of an
> > easier
> > backport which fix a single patch.
> > 
> > v2:
> >  - Wait dma-resv bookkeep before issuing PTE zap (Thomas)
> >  - Support scratch page on invalidation (Thomas)
> > 
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Cc: <stable@vger.kernel.org>
> > Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single
> > job")
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_pt.c       | 146 +++++++++++++++++++++++--------
> >  drivers/gpu/drm/xe/xe_pt_types.h |   3 +-
> >  drivers/gpu/drm/xe/xe_vm.c       |   4 +-
> >  3 files changed, 115 insertions(+), 38 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > index 1ddcc7e79a93..add521b5c6ae 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.c
> > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > @@ -351,7 +351,8 @@ xe_pt_new_shared(struct xe_walk_update *wupd,
> > struct xe_pt *parent,
> >   */
> >  static int
> >  xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
> > -		   pgoff_t offset, struct xe_pt *xe_child, u64 pte)
> > +		   pgoff_t offset, struct xe_pt *xe_child, u64 pte,
> > +		   unsigned int level)
> >  {
> >  	struct xe_pt_update *upd = &xe_walk->wupd.updates[parent->level];
> >  	struct xe_pt_update *child_upd = xe_child ?
> > @@ -389,6 +390,9 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk
> > *xe_walk, struct xe_pt *parent,
> >  		idx = offset - entry->ofs;
> >  		entry->pt_entries[idx].pt = xe_child;
> >  		entry->pt_entries[idx].pte = pte;
> > +		entry->pt_entries[idx].level = level;
> > +		if (likely(!xe_child))
> > +			entry->pt_entries[idx].level |= XE_PT_IS_LEAF;
> >  		entry->qwords++;
> >  	}
> >  
> > @@ -515,7 +519,8 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent,
> > pgoff_t offset,
> >  			}
> >  		}
> >  
> > -		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte);
> > +		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte,
> > +					 level);
> >  		if (unlikely(ret))
> >  			return ret;
> >  
> > @@ -571,7 +576,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent,
> > pgoff_t offset,
> >  
> >  		pte = vm->pt_ops->pde_encode_bo(xe_child->bo, 0, pat_index) | flags;
> >  		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, xe_child,
> > -					 pte);
> > +					 pte, level);
> >  	}
> >  
> >  	*action = ACTION_SUBTREE;
> > @@ -752,6 +757,10 @@ struct xe_pt_zap_ptes_walk {
> >  	/* Input parameters for the walk */
> >  	/** @tile: The tile we're building for */
> >  	struct xe_tile *tile;
> > +	/** @vm: VM we're building for */
> > +	struct xe_vm *vm;
> > +	/** @scratch: write entries with scratch */
> > +	bool scratch;
> >  
> >  	/* Output */
> >  	/** @needs_invalidate: Whether we need to invalidate TLB*/
> > @@ -779,9 +788,18 @@ static int xe_pt_zap_ptes_entry(struct xe_ptw
> > *parent, pgoff_t offset,
> >  	 */
> >  	if (xe_pt_nonshared_offsets(addr, next, --level, walk, action, &offset,
> >  				    &end_offset)) {
> > -		xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
> > -			      offset * sizeof(u64), 0,
> > -			      (end_offset - offset) * sizeof(u64));
> > +		if (unlikely(xe_walk->scratch)) {
> > +			u64 pte = __xe_pt_empty_pte(xe_walk->tile, xe_walk->vm,
> > +						    level);
> > +
> > +			for (; offset < end_offset; ++offset)
> > +				xe_pt_write(tile_to_xe(xe_walk->tile),
> > +					    &xe_child->bo->vmap, offset, pte);
> > +		} else {
> > +			xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
> > +				      offset * sizeof(u64), 0,
> > +				      (end_offset - offset) * sizeof(u64));
> > +		}
> >  		xe_walk->needs_invalidate = true;
> >  	}
> >  
> > @@ -792,6 +810,31 @@ static const struct xe_pt_walk_ops
> > xe_pt_zap_ptes_ops = {
> >  	.pt_entry = xe_pt_zap_ptes_entry,
> >  };
> >  
> > +struct xe_pt_zap_ptes_flags {
> > +	bool scratch:1;
> > +};
> > +
> > +static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma,
> > +			     struct xe_pt_zap_ptes_flags flags)
> > +{
> > +	struct xe_pt_zap_ptes_walk xe_walk = {
> > +		.base = {
> > +			.ops = &xe_pt_zap_ptes_ops,
> > +			.shifts = xe_normal_pt_shifts,
> > +			.max_level = XE_PT_HIGHEST_LEVEL,
> > +		},
> > +		.tile = tile,
> > +		.vm = xe_vma_vm(vma),
> > +		.scratch = flags.scratch,
> > +	};
> > +	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > +
> > +	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
> > +				xe_vma_end(vma), &xe_walk.base);
> > +
> > +	return xe_walk.needs_invalidate;
> > +}
> > +
> >  /**
> >   * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
> >   * @tile: The tile we're zapping for.
> > @@ -810,24 +853,13 @@ static const struct xe_pt_walk_ops
> > xe_pt_zap_ptes_ops = {
> >   */
> >  bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
> >  {
> > -	struct xe_pt_zap_ptes_walk xe_walk = {
> > -		.base = {
> > -			.ops = &xe_pt_zap_ptes_ops,
> > -			.shifts = xe_normal_pt_shifts,
> > -			.max_level = XE_PT_HIGHEST_LEVEL,
> > -		},
> > -		.tile = tile,
> > -	};
> > -	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > +	struct xe_pt_zap_ptes_flags flags = {};
> >  	u8 pt_mask = (vma->tile_present & ~vma->tile_invalidated);
> >  
> >  	if (!(pt_mask & BIT(tile->id)))
> >  		return false;
> >  
> > -	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
> > -				xe_vma_end(vma), &xe_walk.base);
> > -
> > -	return xe_walk.needs_invalidate;
> > +	return __xe_pt_zap_ptes(tile, vma, flags);
> >  }
> >  
> >  static void
> > @@ -1201,7 +1233,46 @@ static bool xe_pt_userptr_inject_eagain(struct
> > xe_userptr_vma *uvma)
> >  
> >  #endif
> >  
> > -static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> > +static void
> > +vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
> > +			    struct xe_vm_pgtable_update_ops *pt_update)
> > +{
> > +	struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
> > +	int i, j, k;
> > +
> > +	/*
> > +	 * Need to update this function to bypass scratch setup if in fault mode
> > +	 */
> > +	xe_assert(xe_vma_vm(vma)->xe, !xe_vm_in_fault_mode(xe_vma_vm(vma)));
> > +
> > +	for (i = 0; i < pt_update->current_op; ++i) {
> > +		struct xe_vm_pgtable_update_op *op = &pt_update->ops[i];
> > +
> > +		if (vma != op->vma || (!op->bind && !op->rebind))
> > +			continue;
> > +
> > +		for (j = 0; j < op->num_entries; ++j) {
> > +			for (k = 0; k < op->entries[j].qwords; ++k) {
> > +				struct xe_pt_entry *entry =
> > +					&op->entries[j].pt_entries[k];
> > +				unsigned int level = entry->level;
> > +
> > +				if (!(level & XE_PT_IS_LEAF))
> > +					continue;
> > +
> > +				level &= ~XE_PT_IS_LEAF;
> > +				entry->pte = __xe_pt_empty_pte(tile,
> > +							       xe_vma_vm(vma),
> > +							       level);
> > +			}
> > +		}
> > +	}
> > +
> > +	__xe_pt_zap_ptes(tile, vma, flags);
> 
> As mentioned in my previous email, I'm pretty sure if we modify all the
> ptes in the entry array, not just the leaves, (that's basically all
> ptes of shared page-table entries) that will be equivalent to a zap.
> 

That doesn't work. I had it that way originally, but IGTs fail with
IOMMU CAT errors (e.g. xe_exec_basic.once-userptr fails like this [1])

Let me explain with an example (see the sketch after this list).

Array of binds: 0x0000-0x1000 (A), 0x1000-0x2000 (B)

- (A) hits a userptr invalidation
	- If we modify all PTEs in the entry array, the highest level PDE
	  is invalidated. Both (A) and (B) are either 0 or scratch
- (A) does a rebind in an exec
	- We only modify the leaf entry, not the highest level PDE, which
	  is still 0 or scratch
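
Roughly, assuming (A) and (B) happen to share a page-table directory:

	        PDE (shared by A and B)
	       /                       \
	  leaf PTEs of (A)        leaf PTEs of (B)

Zeroing or scratching the shared PDE tears down (B)'s mapping too, and
the later rebind of (A) only rewrites (A)'s leaves, so the PDE never
gets restored -- hence CAT errors like [1] below.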

Matt

[1]
[  359.895920] [IGT] xe_exec_basic: starting subtest once-userptr
[  359.902643] xe 0000:03:00.0: [drm:pf_queue_work_func [xe]]
                ASID: 462
                VFID: 0
                PDATA: 0x0c90
                Faulted Address: 0x00000000001a0000
                FaultType: 0
                AccessType: 0
                FaultLevel: 4
                EngineClass: 0 rcs
                EngineInstance: 0
[  359.902686] xe 0000:03:00.0: [drm:pf_queue_work_func [xe]] Fault response: Unsuccessful -22
[  359.902890] xe 0000:03:00.0: [drm:xe_guc_exec_queue_memory_cat_error_handler [xe]] GT0: Engine memory cat error: engine_class=rcs, logical_mask: 0x1, guc_id=2
[  359.905791] xe 0000:03:00.0: [drm] GT0: Engine reset: engine_class=rcs, logical_mask: 0x1, guc_id=2
[  359.905826] xe 0000:03:00.0: [drm] GT0: Timedout job: seqno=4294967169, lrc_seqno=4294967169, guc_id=2, flags=0x0 in xe_exec_basic [9607]
[  359.905831] xe 0000:03:00.0: [drm:xe_devcoredump [xe]] Multiple hangs are occurring, but only the first snapshot was taken
[  359.962840] [IGT] xe_exec_basic: finished subtest once-userptr, FAIL
[  359.963049] [IGT] xe_exec_basic: exiting, ret=98


> /Thomas
> 
> 
> > +}
> > +
> > +static int vma_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
> > +			     struct xe_vma *vma,
> >  			     struct xe_vm_pgtable_update_ops *pt_update)
> >  {
> >  	struct xe_userptr_vma *uvma;
> > @@ -1215,9 +1286,6 @@ static int vma_check_userptr(struct xe_vm *vm,
> > struct xe_vma *vma,
> >  	uvma = to_userptr_vma(vma);
> >  	notifier_seq = uvma->userptr.notifier_seq;
> >  
> > -	if (uvma->userptr.initial_bind && !xe_vm_in_fault_mode(vm))
> > -		return 0;
> > -
> >  	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
> >  				     notifier_seq) &&
> >  	    !xe_pt_userptr_inject_eagain(uvma))
> > @@ -1226,6 +1294,8 @@ static int vma_check_userptr(struct xe_vm *vm,
> > struct xe_vma *vma,
> >  	if (xe_vm_in_fault_mode(vm)) {
> >  		return -EAGAIN;
> >  	} else {
> > +		long err;
> > +
> >  		spin_lock(&vm->userptr.invalidated_lock);
> >  		list_move_tail(&uvma->userptr.invalidate_link,
> >  			       &vm->userptr.invalidated);
> > @@ -1234,25 +1304,27 @@ static int vma_check_userptr(struct xe_vm
> > *vm, struct xe_vma *vma,
> >  		if (xe_vm_in_preempt_fence_mode(vm)) {
> >  			struct dma_resv_iter cursor;
> >  			struct dma_fence *fence;
> > -			long err;
> >  
> >  			dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
> >  					    DMA_RESV_USAGE_BOOKKEEP);
> >  			dma_resv_for_each_fence_unlocked(&cursor, fence)
> >  				dma_fence_enable_sw_signaling(fence);
> >  			dma_resv_iter_end(&cursor);
> > -
> > -			err = dma_resv_wait_timeout(xe_vm_resv(vm),
> > -						    DMA_RESV_USAGE_BOOKKEEP,
> > -						    false, MAX_SCHEDULE_TIMEOUT);
> > -			XE_WARN_ON(err <= 0);
> >  		}
> > +
> > +		err = dma_resv_wait_timeout(xe_vm_resv(vm),
> > +					    DMA_RESV_USAGE_BOOKKEEP,
> > +					    false, MAX_SCHEDULE_TIMEOUT);
> > +		XE_WARN_ON(err <= 0);
> > +
> > +		vma_convert_to_invalidation(tile, vma, pt_update);
> >  	}
> >  
> >  	return 0;
> >  }
> >  
> > -static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
> > +static int op_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
> > +			    struct xe_vma_op *op,
> >  			    struct xe_vm_pgtable_update_ops *pt_update)
> >  {
> >  	int err = 0;
> > @@ -1264,18 +1336,21 @@ static int op_check_userptr(struct xe_vm *vm,
> > struct xe_vma_op *op,
> >  		if (!op->map.immediate && xe_vm_in_fault_mode(vm))
> >  			break;
> >  
> > -		err = vma_check_userptr(vm, op->map.vma, pt_update);
> > +		err = vma_check_userptr(tile, vm, op->map.vma, pt_update);
> >  		break;
> >  	case DRM_GPUVA_OP_REMAP:
> >  		if (op->remap.prev)
> > -			err = vma_check_userptr(vm, op->remap.prev, pt_update);
> > +			err = vma_check_userptr(tile, vm, op->remap.prev,
> > +						pt_update);
> >  		if (!err && op->remap.next)
> > -			err = vma_check_userptr(vm, op->remap.next, pt_update);
> > +			err = vma_check_userptr(tile, vm, op->remap.next,
> > +						pt_update);
> >  		break;
> >  	case DRM_GPUVA_OP_UNMAP:
> >  		break;
> >  	case DRM_GPUVA_OP_PREFETCH:
> > -		err = vma_check_userptr(vm, gpuva_to_vma(op->base.prefetch.va),
> > +		err = vma_check_userptr(tile, vm,
> > +					gpuva_to_vma(op->base.prefetch.va),
> > +					pt_update);
> >  		break;
> >  	default:
> > @@ -1301,7 +1376,8 @@ static int xe_pt_userptr_pre_commit(struct
> > xe_migrate_pt_update *pt_update)
> >  	down_read(&vm->userptr.notifier_lock);
> >  
> >  	list_for_each_entry(op, &vops->list, link) {
> > -		err = op_check_userptr(vm, op, pt_update_ops);
> > +		err = op_check_userptr(&vm->xe->tiles[pt_update->tile_id],
> > +				       vm, op, pt_update_ops);
> >  		if (err) {
> >  			up_read(&vm->userptr.notifier_lock);
> >  			break;
> > diff --git a/drivers/gpu/drm/xe/xe_pt_types.h
> > b/drivers/gpu/drm/xe/xe_pt_types.h
> > index 384cc04de719..6f99ff2b8fce 100644
> > --- a/drivers/gpu/drm/xe/xe_pt_types.h
> > +++ b/drivers/gpu/drm/xe/xe_pt_types.h
> > @@ -29,7 +29,6 @@ struct xe_pt {
> >  	struct xe_bo *bo;
> >  	unsigned int level;
> >  	unsigned int num_live;
> > -	bool rebind;
> >  	bool is_compact;
> >  #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
> >  	/** addr: Virtual address start address of the PT. */
> > @@ -52,6 +51,8 @@ struct xe_pt_ops {
> >  struct xe_pt_entry {
> >  	struct xe_pt *pt;
> >  	u64 pte;
> > +#define XE_PT_IS_LEAF	BIT(31)
> > +	unsigned int level;
> >  };
> >  
> >  struct xe_vm_pgtable_update {
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index ea2e287e6526..f90e5c92010c 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -623,8 +623,6 @@ static bool vma_userptr_invalidate(struct
> > mmu_interval_notifier *mni,
> >  		spin_unlock(&vm->userptr.invalidated_lock);
> >  	}
> >  
> > -	up_write(&vm->userptr.notifier_lock);
> > -
> >  	/*
> >  	 * Preempt fences turn into schedule disables, pipeline these.
> >  	 * Note that even in fault mode, we need to wait for binds and
> > @@ -647,6 +645,8 @@ static bool vma_userptr_invalidate(struct
> > mmu_interval_notifier *mni,
> >  		XE_WARN_ON(err);
> >  	}
> >  
> > +	up_write(&vm->userptr.notifier_lock);
> > +
> >  	trace_xe_vma_userptr_invalidate_complete(vma);
> >  
> >  	return true;
> 


* Re: [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes
  2025-02-24 15:06     ` Matthew Brost
@ 2025-02-24 15:20       ` Thomas Hellström
  2025-02-24 15:35         ` Matthew Brost
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Hellström @ 2025-02-24 15:20 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe, matthew.auld

On Mon, 2025-02-24 at 07:06 -0800, Matthew Brost wrote:
> On Mon, Feb 24, 2025 at 09:21:09AM +0100, Thomas Hellström wrote:
> > On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> > > Squash bind operation after userptr invalidation into a clearing
> > > of
> > > PTEs. Will prevent valid GPU page tables from pointing to stale
> > > CPU
> > > pages.
> > > 
> > > Fixup initial bind handling always add VMAs to invalidation list
> > > and
> > > clear PTEs.
> > > 
> > > Remove unused rebind variable in xe_pt.
> > > 
> > > Always hold notifier across TLB invalidation in notifier to
> > > prevent a
> > > UAF if an unbind races.
> > > 
> > > Including all of the above changes for Fixes patch in hopes of an
> > > easier
> > > backport which fix a single patch.
> > > 
> > > v2:
> > >  - Wait dma-resv bookkeep before issuing PTE zap (Thomas)
> > >  - Support scratch page on invalidation (Thomas)
> > > 
> > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > Cc: <stable@vger.kernel.org>
> > > Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into
> > > single
> > > job")
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/xe_pt.c       | 146 +++++++++++++++++++++++--
> > > ----
> > > --
> > >  drivers/gpu/drm/xe/xe_pt_types.h |   3 +-
> > >  drivers/gpu/drm/xe/xe_vm.c       |   4 +-
> > >  3 files changed, 115 insertions(+), 38 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/xe/xe_pt.c
> > > b/drivers/gpu/drm/xe/xe_pt.c
> > > index 1ddcc7e79a93..add521b5c6ae 100644
> > > --- a/drivers/gpu/drm/xe/xe_pt.c
> > > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > > @@ -351,7 +351,8 @@ xe_pt_new_shared(struct xe_walk_update *wupd,
> > > struct xe_pt *parent,
> > >   */
> > >  static int
> > >  xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct
> > > xe_pt *parent,
> > > -		   pgoff_t offset, struct xe_pt *xe_child, u64
> > > pte)
> > > +		   pgoff_t offset, struct xe_pt *xe_child, u64
> > > pte,
> > > +		   unsigned int level)
> > >  {
> > >  	struct xe_pt_update *upd = &xe_walk-
> > > >wupd.updates[parent-
> > > > level];
> > >  	struct xe_pt_update *child_upd = xe_child ?
> > > @@ -389,6 +390,9 @@ xe_pt_insert_entry(struct
> > > xe_pt_stage_bind_walk
> > > *xe_walk, struct xe_pt *parent,
> > >  		idx = offset - entry->ofs;
> > >  		entry->pt_entries[idx].pt = xe_child;
> > >  		entry->pt_entries[idx].pte = pte;
> > > +		entry->pt_entries[idx].level = level;
> > > +		if (likely(!xe_child))
> > > +			entry->pt_entries[idx].level |=
> > > XE_PT_IS_LEAF;
> > >  		entry->qwords++;
> > >  	}
> > >  
> > > @@ -515,7 +519,8 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent,
> > > pgoff_t offset,
> > >  			}
> > >  		}
> > >  
> > > -		ret = xe_pt_insert_entry(xe_walk, xe_parent,
> > > offset,
> > > NULL, pte);
> > > +		ret = xe_pt_insert_entry(xe_walk, xe_parent,
> > > offset,
> > > NULL, pte,
> > > +					 level);
> > >  		if (unlikely(ret))
> > >  			return ret;
> > >  
> > > @@ -571,7 +576,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent,
> > > pgoff_t offset,
> > >  
> > >  		pte = vm->pt_ops->pde_encode_bo(xe_child->bo, 0,
> > > pat_index) | flags;
> > >  		ret = xe_pt_insert_entry(xe_walk, xe_parent,
> > > offset,
> > > xe_child,
> > > -					 pte);
> > > +					 pte, level);
> > >  	}
> > >  
> > >  	*action = ACTION_SUBTREE;
> > > @@ -752,6 +757,10 @@ struct xe_pt_zap_ptes_walk {
> > >  	/* Input parameters for the walk */
> > >  	/** @tile: The tile we're building for */
> > >  	struct xe_tile *tile;
> > > +	/** @vm: VM we're building for */
> > > +	struct xe_vm *vm;
> > > +	/** @scratch: write entries with scratch */
> > > +	bool scratch;
> > >  
> > >  	/* Output */
> > >  	/** @needs_invalidate: Whether we need to invalidate
> > > TLB*/
> > > @@ -779,9 +788,18 @@ static int xe_pt_zap_ptes_entry(struct
> > > xe_ptw
> > > *parent, pgoff_t offset,
> > >  	 */
> > >  	if (xe_pt_nonshared_offsets(addr, next, --level, walk,
> > > action, &offset,
> > >  				    &end_offset)) {
> > > -		xe_map_memset(tile_to_xe(xe_walk->tile),
> > > &xe_child-
> > > > bo->vmap,
> > > -			      offset * sizeof(u64), 0,
> > > -			      (end_offset - offset) *
> > > sizeof(u64));
> > > +		if (unlikely(xe_walk->scratch)) {
> > > +			u64 pte = __xe_pt_empty_pte(xe_walk-
> > > >tile,
> > > xe_walk->vm,
> > > +						    level);
> > > +
> > > +			for (; offset < end_offset; ++offset)
> > > +				xe_pt_write(tile_to_xe(xe_walk-
> > > > tile),
> > > +					    &xe_child->bo->vmap,
> > > offset, pte);
> > > +		} else {
> > > +			xe_map_memset(tile_to_xe(xe_walk->tile),
> > > &xe_child->bo->vmap,
> > > +				      offset * sizeof(u64), 0,
> > > +				      (end_offset - offset) *
> > > sizeof(u64));
> > > +		}
> > >  		xe_walk->needs_invalidate = true;
> > >  	}
> > >  
> > > @@ -792,6 +810,31 @@ static const struct xe_pt_walk_ops
> > > xe_pt_zap_ptes_ops = {
> > >  	.pt_entry = xe_pt_zap_ptes_entry,
> > >  };
> > >  
> > > +struct xe_pt_zap_ptes_flags {
> > > +	bool scratch:1;
> > > +};
> > > +
> > > +static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma
> > > *vma,
> > > +			     struct xe_pt_zap_ptes_flags flags)
> > > +{
> > > +	struct xe_pt_zap_ptes_walk xe_walk = {
> > > +		.base = {
> > > +			.ops = &xe_pt_zap_ptes_ops,
> > > +			.shifts = xe_normal_pt_shifts,
> > > +			.max_level = XE_PT_HIGHEST_LEVEL,
> > > +		},
> > > +		.tile = tile,
> > > +		.vm = xe_vma_vm(vma),
> > > +		.scratch = flags.scratch,
> > > +	};
> > > +	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > > +
> > > +	(void)xe_pt_walk_shared(&pt->base, pt->level,
> > > xe_vma_start(vma),
> > > +				xe_vma_end(vma), &xe_walk.base);
> > > +
> > > +	return xe_walk.needs_invalidate;
> > > +}
> > > +
> > >  /**
> > >   * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
> > >   * @tile: The tile we're zapping for.
> > > @@ -810,24 +853,13 @@ static const struct xe_pt_walk_ops
> > > xe_pt_zap_ptes_ops = {
> > >   */
> > >  bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
> > >  {
> > > -	struct xe_pt_zap_ptes_walk xe_walk = {
> > > -		.base = {
> > > -			.ops = &xe_pt_zap_ptes_ops,
> > > -			.shifts = xe_normal_pt_shifts,
> > > -			.max_level = XE_PT_HIGHEST_LEVEL,
> > > -		},
> > > -		.tile = tile,
> > > -	};
> > > -	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > > +	struct xe_pt_zap_ptes_flags flags = {};
> > >  	u8 pt_mask = (vma->tile_present & ~vma-
> > > >tile_invalidated);
> > >  
> > >  	if (!(pt_mask & BIT(tile->id)))
> > >  		return false;
> > >  
> > > -	(void)xe_pt_walk_shared(&pt->base, pt->level,
> > > xe_vma_start(vma),
> > > -				xe_vma_end(vma), &xe_walk.base);
> > > -
> > > -	return xe_walk.needs_invalidate;
> > > +	return __xe_pt_zap_ptes(tile, vma, flags);
> > >  }
> > >  
> > >  static void
> > > @@ -1201,7 +1233,46 @@ static bool
> > > xe_pt_userptr_inject_eagain(struct
> > > xe_userptr_vma *uvma)
> > >  
> > >  #endif
> > >  
> > > -static int vma_check_userptr(struct xe_vm *vm, struct xe_vma
> > > *vma,
> > > +static void
> > > +vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma
> > > *vma,
> > > +			    struct xe_vm_pgtable_update_ops
> > > *pt_update)
> > > +{
> > > +	struct xe_pt_zap_ptes_flags flags = { .scratch = true,
> > > };
> > > +	int i, j, k;
> > > +
> > > +	/*
> > > +	 * Need to update this function to bypass scratch setup
> > > if
> > > in fault mode
> > > +	 */
> > > +	xe_assert(xe_vma_vm(vma)->xe,
> > > !xe_vm_in_fault_mode(xe_vma_vm(vma)));
> > > +
> > > +	for (i = 0; i < pt_update->current_op; ++i) {
> > > +		struct xe_vm_pgtable_update_op *op = &pt_update-
> > > > ops[i];
> > > +
> > > +		if (vma != op->vma || (!op->bind && !op-
> > > >rebind))
> > > +			continue;
> > > +
> > > +		for (j = 0; j < op->num_entries; ++j) {
> > > +			for (k = 0; k < op->entries[j].qwords;
> > > ++k)
> > > {
> > > +				struct xe_pt_entry *entry =
> > > +					&op-
> > > > entries[j].pt_entries[k];
> > > +				unsigned int level = entry-
> > > >level;
> > > +
> > > +				if (!(level & XE_PT_IS_LEAF))
> > > +					continue;
> > > +
> > > +				level &= ~XE_PT_IS_LEAF;
> > > +				entry->pte =
> > > __xe_pt_empty_pte(tile,
> > > +							      
> > > xe_vma_vm(vma),
> > > +							      
> > > level);
> > > +			}
> > > +		}
> > > +	}
> > > +
> > > +	__xe_pt_zap_ptes(tile, vma, flags);
> > 
> > As mentioned in my previous email, I'm pretty sure if we modify all
> > the ptes in the entry array, not just the leaves, (that's basically
> > all ptes of shared page-table entries) that will be equivalent to a
> > zap.
> > 
> 
> That doesn't work. I had it that way originally, but IGTs fail with
> IOMMU CAT errors (e.g. xe_exec_basic.once-userptr fails like this [1])
> 
> Let me explain with an example.
> 
> Array of binds: 0x0000-0x1000 (A), 0x1000-0x2000 (B)
> 
> - (A) hits a userptr invalidation
> 	- If we modify all PTEs in the entry array, the highest level
> 	  PDE is invalidated. Both (A) and (B) are either 0 or scratch
> - (A) does a rebind in an exec
> 	- We only modify the leaf entry, not the highest level PDE,
> 	  which is still 0 or scratch
> 
> Matt

Argh. You're right. What would happen if we don't do anything to the
PTEs, then? It looks from the code that in faulting mode we error and
unwind with -EAGAIN.

In preempt-fence mode and !LR we could instead make sure to properly
stop GPU access and put the userptr on the invalidated list in the
notifier.

The reason I don't really like zapping here is that it's yet another
corner case in the code. If this works by doing it in the notifier, we
get rid of two corner cases.

/Thomas




> 
> [1]
> [  359.895920] [IGT] xe_exec_basic: starting subtest once-userptr
> [  359.902643] xe 0000:03:00.0: [drm:pf_queue_work_func [xe]]
>                 ASID: 462
>                 VFID: 0
>                 PDATA: 0x0c90
>                 Faulted Address: 0x00000000001a0000
>                 FaultType: 0
>                 AccessType: 0
>                 FaultLevel: 4
>                 EngineClass: 0 rcs
>                 EngineInstance: 0
> [  359.902686] xe 0000:03:00.0: [drm:pf_queue_work_func [xe]] Fault
> response: Unsuccessful -22
> [  359.902890] xe 0000:03:00.0:
> [drm:xe_guc_exec_queue_memory_cat_error_handler [xe]] GT0: Engine
> memory cat error: engine_class=rcs, logical_mask: 0x1, guc_id=2
> [  359.905791] xe 0000:03:00.0: [drm] GT0: Engine reset:
> engine_class=rcs, logical_mask: 0x1, guc_id=2
> [  359.905826] xe 0000:03:00.0: [drm] GT0: Timedout job:
> seqno=4294967169, lrc_seqno=4294967169, guc_id=2, flags=0x0 in
> xe_exec_basic [9607]
> [  359.905831] xe 0000:03:00.0: [drm:xe_devcoredump [xe]] Multiple
> hangs are occurring, but only the first snapshot was taken
> [  359.962840] [IGT] xe_exec_basic: finished subtest once-userptr,
> FAIL
> [  359.963049] [IGT] xe_exec_basic: exiting, ret=98
> 
> 
> > /Thomas
> > 
> > 
> > > +}
> > > +
> > > +static int vma_check_userptr(struct xe_tile *tile, struct xe_vm
> > > *vm,
> > > +			     struct xe_vma *vma,
> > >  			     struct xe_vm_pgtable_update_ops
> > > *pt_update)
> > >  {
> > >  	struct xe_userptr_vma *uvma;
> > > @@ -1215,9 +1286,6 @@ static int vma_check_userptr(struct xe_vm
> > > *vm,
> > > struct xe_vma *vma,
> > >  	uvma = to_userptr_vma(vma);
> > >  	notifier_seq = uvma->userptr.notifier_seq;
> > >  
> > > -	if (uvma->userptr.initial_bind &&
> > > !xe_vm_in_fault_mode(vm))
> > > -		return 0;
> > > -
> > >  	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
> > >  				     notifier_seq) &&
> > >  	    !xe_pt_userptr_inject_eagain(uvma))
> > > @@ -1226,6 +1294,8 @@ static int vma_check_userptr(struct xe_vm
> > > *vm,
> > > struct xe_vma *vma,
> > >  	if (xe_vm_in_fault_mode(vm)) {
> > >  		return -EAGAIN;
> > >  	} else {
> > > +		long err;
> > > +
> > >  		spin_lock(&vm->userptr.invalidated_lock);
> > >  		list_move_tail(&uvma->userptr.invalidate_link,
> > >  			       &vm->userptr.invalidated);
> > > @@ -1234,25 +1304,27 @@ static int vma_check_userptr(struct xe_vm
> > > *vm, struct xe_vma *vma,
> > >  		if (xe_vm_in_preempt_fence_mode(vm)) {
> > >  			struct dma_resv_iter cursor;
> > >  			struct dma_fence *fence;
> > > -			long err;
> > >  
> > >  			dma_resv_iter_begin(&cursor,
> > > xe_vm_resv(vm),
> > >  					   
> > > DMA_RESV_USAGE_BOOKKEEP);
> > >  			dma_resv_for_each_fence_unlocked(&cursor
> > > ,
> > > fence)
> > >  				dma_fence_enable_sw_signaling(fe
> > > nce)
> > > ;
> > >  			dma_resv_iter_end(&cursor);
> > > -
> > > -			err =
> > > dma_resv_wait_timeout(xe_vm_resv(vm),
> > > -						   
> > > DMA_RESV_USAGE_BOOKKEEP,
> > > -						    false,
> > > MAX_SCHEDULE_TIMEOUT);
> > > -			XE_WARN_ON(err <= 0);
> > >  		}
> > > +
> > > +		err = dma_resv_wait_timeout(xe_vm_resv(vm),
> > > +					   
> > > DMA_RESV_USAGE_BOOKKEEP,
> > > +					    false,
> > > MAX_SCHEDULE_TIMEOUT);
> > > +		XE_WARN_ON(err <= 0);
> > > +
> > > +		vma_convert_to_invalidation(tile, vma,
> > > pt_update);
> > >  	}
> > >  
> > >  	return 0;
> > >  }
> > >  
> > > -static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op
> > > *op,
> > > +static int op_check_userptr(struct xe_tile *tile, struct xe_vm
> > > *vm,
> > > +			    struct xe_vma_op *op,
> > >  			    struct xe_vm_pgtable_update_ops
> > > *pt_update)
> > >  {
> > >  	int err = 0;
> > > @@ -1264,18 +1336,21 @@ static int op_check_userptr(struct xe_vm
> > > *vm,
> > > struct xe_vma_op *op,
> > >  		if (!op->map.immediate &&
> > > xe_vm_in_fault_mode(vm))
> > >  			break;
> > >  
> > > -		err = vma_check_userptr(vm, op->map.vma,
> > > pt_update);
> > > +		err = vma_check_userptr(tile, vm, op->map.vma,
> > > pt_update);
> > >  		break;
> > >  	case DRM_GPUVA_OP_REMAP:
> > >  		if (op->remap.prev)
> > > -			err = vma_check_userptr(vm, op-
> > > >remap.prev,
> > > pt_update);
> > > +			err = vma_check_userptr(tile, vm, op-
> > > > remap.prev,
> > > +						pt_update);
> > >  		if (!err && op->remap.next)
> > > -			err = vma_check_userptr(vm, op-
> > > >remap.next,
> > > pt_update);
> > > +			err = vma_check_userptr(tile, vm, op-
> > > > remap.next,
> > > +						pt_update);
> > >  		break;
> > >  	case DRM_GPUVA_OP_UNMAP:
> > >  		break;
> > >  	case DRM_GPUVA_OP_PREFETCH:
> > > -		err = vma_check_userptr(vm, gpuva_to_vma(op-
> > > > base.prefetch.va),
> > > +		err = vma_check_userptr(tile, vm,
> > > +					gpuva_to_vma(op-
> > > > base.prefetch.va),
> > >  					pt_update);
> > >  		break;
> > >  	default:
> > > @@ -1301,7 +1376,8 @@ static int xe_pt_userptr_pre_commit(struct
> > > xe_migrate_pt_update *pt_update)
> > >  	down_read(&vm->userptr.notifier_lock);
> > >  
> > >  	list_for_each_entry(op, &vops->list, link) {
> > > -		err = op_check_userptr(vm, op, pt_update_ops);
> > > +		err = op_check_userptr(&vm->xe->tiles[pt_update-
> > > > tile_id],
> > > +				       vm, op, pt_update_ops);
> > >  		if (err) {
> > >  			up_read(&vm->userptr.notifier_lock);
> > >  			break;
> > > diff --git a/drivers/gpu/drm/xe/xe_pt_types.h
> > > b/drivers/gpu/drm/xe/xe_pt_types.h
> > > index 384cc04de719..6f99ff2b8fce 100644
> > > --- a/drivers/gpu/drm/xe/xe_pt_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_pt_types.h
> > > @@ -29,7 +29,6 @@ struct xe_pt {
> > >  	struct xe_bo *bo;
> > >  	unsigned int level;
> > >  	unsigned int num_live;
> > > -	bool rebind;
> > >  	bool is_compact;
> > >  #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
> > >  	/** addr: Virtual address start address of the PT. */
> > > @@ -52,6 +51,8 @@ struct xe_pt_ops {
> > >  struct xe_pt_entry {
> > >  	struct xe_pt *pt;
> > >  	u64 pte;
> > > +#define XE_PT_IS_LEAF	BIT(31)
> > > +	unsigned int level;
> > >  };
> > >  
> > >  struct xe_vm_pgtable_update {
> > > diff --git a/drivers/gpu/drm/xe/xe_vm.c
> > > b/drivers/gpu/drm/xe/xe_vm.c
> > > index ea2e287e6526..f90e5c92010c 100644
> > > --- a/drivers/gpu/drm/xe/xe_vm.c
> > > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > > @@ -623,8 +623,6 @@ static bool vma_userptr_invalidate(struct
> > > mmu_interval_notifier *mni,
> > >  		spin_unlock(&vm->userptr.invalidated_lock);
> > >  	}
> > >  
> > > -	up_write(&vm->userptr.notifier_lock);
> > > -
> > >  	/*
> > >  	 * Preempt fences turn into schedule disables, pipeline
> > > these.
> > >  	 * Note that even in fault mode, we need to wait for
> > > binds
> > > and
> > > @@ -647,6 +645,8 @@ static bool vma_userptr_invalidate(struct
> > > mmu_interval_notifier *mni,
> > >  		XE_WARN_ON(err);
> > >  	}
> > >  
> > > +	up_write(&vm->userptr.notifier_lock);
> > > +
> > >  	trace_xe_vma_userptr_invalidate_complete(vma);
> > >  
> > >  	return true;
> > 



* Re: [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes
  2025-02-24 15:20       ` Thomas Hellström
@ 2025-02-24 15:35         ` Matthew Brost
  2025-02-24 16:22           ` Thomas Hellström
  0 siblings, 1 reply; 14+ messages in thread
From: Matthew Brost @ 2025-02-24 15:35 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: intel-xe, matthew.auld

On Mon, Feb 24, 2025 at 04:20:07PM +0100, Thomas Hellström wrote:
> On Mon, 2025-02-24 at 07:06 -0800, Matthew Brost wrote:
> > On Mon, Feb 24, 2025 at 09:21:09AM +0100, Thomas Hellström wrote:
> > > On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> > > > Squash bind operation after userptr invalidation into a clearing
> > > > of
> > > > PTEs. Will prevent valid GPU page tables from pointing to stale
> > > > CPU
> > > > pages.
> > > > 
> > > > Fixup initial bind handling always add VMAs to invalidation list
> > > > and
> > > > clear PTEs.
> > > > 
> > > > Remove unused rebind variable in xe_pt.
> > > > 
> > > > Always hold notifier across TLB invalidation in notifier to
> > > > prevent a
> > > > UAF if an unbind races.
> > > > 
> > > > Including all of the above changes for Fixes patch in hopes of an
> > > > easier
> > > > backport which fix a single patch.
> > > > 
> > > > v2:
> > > >  - Wait dma-resv bookkeep before issuing PTE zap (Thomas)
> > > >  - Support scratch page on invalidation (Thomas)
> > > > 
> > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > Cc: <stable@vger.kernel.org>
> > > > Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into
> > > > single
> > > > job")
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >  drivers/gpu/drm/xe/xe_pt.c       | 146 +++++++++++++++++++++++--
> > > > ----
> > > > --
> > > >  drivers/gpu/drm/xe/xe_pt_types.h |   3 +-
> > > >  drivers/gpu/drm/xe/xe_vm.c       |   4 +-
> > > >  3 files changed, 115 insertions(+), 38 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/xe/xe_pt.c
> > > > b/drivers/gpu/drm/xe/xe_pt.c
> > > > index 1ddcc7e79a93..add521b5c6ae 100644
> > > > --- a/drivers/gpu/drm/xe/xe_pt.c
> > > > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > > > @@ -351,7 +351,8 @@ xe_pt_new_shared(struct xe_walk_update *wupd,
> > > > struct xe_pt *parent,
> > > >   */
> > > >  static int
> > > >  xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct
> > > > xe_pt *parent,
> > > > -		   pgoff_t offset, struct xe_pt *xe_child, u64
> > > > pte)
> > > > +		   pgoff_t offset, struct xe_pt *xe_child, u64
> > > > pte,
> > > > +		   unsigned int level)
> > > >  {
> > > >  	struct xe_pt_update *upd = &xe_walk-
> > > > >wupd.updates[parent-
> > > > > level];
> > > >  	struct xe_pt_update *child_upd = xe_child ?
> > > > @@ -389,6 +390,9 @@ xe_pt_insert_entry(struct
> > > > xe_pt_stage_bind_walk
> > > > *xe_walk, struct xe_pt *parent,
> > > >  		idx = offset - entry->ofs;
> > > >  		entry->pt_entries[idx].pt = xe_child;
> > > >  		entry->pt_entries[idx].pte = pte;
> > > > +		entry->pt_entries[idx].level = level;
> > > > +		if (likely(!xe_child))
> > > > +			entry->pt_entries[idx].level |=
> > > > XE_PT_IS_LEAF;
> > > >  		entry->qwords++;
> > > >  	}
> > > >  
> > > > @@ -515,7 +519,8 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent,
> > > > pgoff_t offset,
> > > >  			}
> > > >  		}
> > > >  
> > > > -		ret = xe_pt_insert_entry(xe_walk, xe_parent,
> > > > offset,
> > > > NULL, pte);
> > > > +		ret = xe_pt_insert_entry(xe_walk, xe_parent,
> > > > offset,
> > > > NULL, pte,
> > > > +					 level);
> > > >  		if (unlikely(ret))
> > > >  			return ret;
> > > >  
> > > > @@ -571,7 +576,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent,
> > > > pgoff_t offset,
> > > >  
> > > >  		pte = vm->pt_ops->pde_encode_bo(xe_child->bo, 0,
> > > > pat_index) | flags;
> > > >  		ret = xe_pt_insert_entry(xe_walk, xe_parent,
> > > > offset,
> > > > xe_child,
> > > > -					 pte);
> > > > +					 pte, level);
> > > >  	}
> > > >  
> > > >  	*action = ACTION_SUBTREE;
> > > > @@ -752,6 +757,10 @@ struct xe_pt_zap_ptes_walk {
> > > >  	/* Input parameters for the walk */
> > > >  	/** @tile: The tile we're building for */
> > > >  	struct xe_tile *tile;
> > > > +	/** @vm: VM we're building for */
> > > > +	struct xe_vm *vm;
> > > > +	/** @scratch: write entries with scratch */
> > > > +	bool scratch;
> > > >  
> > > >  	/* Output */
> > > >  	/** @needs_invalidate: Whether we need to invalidate
> > > > TLB*/
> > > > @@ -779,9 +788,18 @@ static int xe_pt_zap_ptes_entry(struct
> > > > xe_ptw
> > > > *parent, pgoff_t offset,
> > > >  	 */
> > > >  	if (xe_pt_nonshared_offsets(addr, next, --level, walk,
> > > > action, &offset,
> > > >  				    &end_offset)) {
> > > > -		xe_map_memset(tile_to_xe(xe_walk->tile),
> > > > &xe_child-
> > > > > bo->vmap,
> > > > -			      offset * sizeof(u64), 0,
> > > > -			      (end_offset - offset) *
> > > > sizeof(u64));
> > > > +		if (unlikely(xe_walk->scratch)) {
> > > > +			u64 pte = __xe_pt_empty_pte(xe_walk-
> > > > >tile,
> > > > xe_walk->vm,
> > > > +						    level);
> > > > +
> > > > +			for (; offset < end_offset; ++offset)
> > > > +				xe_pt_write(tile_to_xe(xe_walk-
> > > > > tile),
> > > > +					    &xe_child->bo->vmap,
> > > > offset, pte);
> > > > +		} else {
> > > > +			xe_map_memset(tile_to_xe(xe_walk->tile),
> > > > &xe_child->bo->vmap,
> > > > +				      offset * sizeof(u64), 0,
> > > > +				      (end_offset - offset) *
> > > > sizeof(u64));
> > > > +		}
> > > >  		xe_walk->needs_invalidate = true;
> > > >  	}
> > > >  
> > > > @@ -792,6 +810,31 @@ static const struct xe_pt_walk_ops
> > > > xe_pt_zap_ptes_ops = {
> > > >  	.pt_entry = xe_pt_zap_ptes_entry,
> > > >  };
> > > >  
> > > > +struct xe_pt_zap_ptes_flags {
> > > > +	bool scratch:1;
> > > > +};
> > > > +
> > > > +static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma
> > > > *vma,
> > > > +			     struct xe_pt_zap_ptes_flags flags)
> > > > +{
> > > > +	struct xe_pt_zap_ptes_walk xe_walk = {
> > > > +		.base = {
> > > > +			.ops = &xe_pt_zap_ptes_ops,
> > > > +			.shifts = xe_normal_pt_shifts,
> > > > +			.max_level = XE_PT_HIGHEST_LEVEL,
> > > > +		},
> > > > +		.tile = tile,
> > > > +		.vm = xe_vma_vm(vma),
> > > > +		.scratch = flags.scratch,
> > > > +	};
> > > > +	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > > > +
> > > > +	(void)xe_pt_walk_shared(&pt->base, pt->level,
> > > > xe_vma_start(vma),
> > > > +				xe_vma_end(vma), &xe_walk.base);
> > > > +
> > > > +	return xe_walk.needs_invalidate;
> > > > +}
> > > > +
> > > >  /**
> > > >   * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
> > > >   * @tile: The tile we're zapping for.
> > > > @@ -810,24 +853,13 @@ static const struct xe_pt_walk_ops
> > > > xe_pt_zap_ptes_ops = {
> > > >   */
> > > >  bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
> > > >  {
> > > > -	struct xe_pt_zap_ptes_walk xe_walk = {
> > > > -		.base = {
> > > > -			.ops = &xe_pt_zap_ptes_ops,
> > > > -			.shifts = xe_normal_pt_shifts,
> > > > -			.max_level = XE_PT_HIGHEST_LEVEL,
> > > > -		},
> > > > -		.tile = tile,
> > > > -	};
> > > > -	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > > > +	struct xe_pt_zap_ptes_flags flags = {};
> > > >  	u8 pt_mask = (vma->tile_present & ~vma-
> > > > >tile_invalidated);
> > > >  
> > > >  	if (!(pt_mask & BIT(tile->id)))
> > > >  		return false;
> > > >  
> > > > -	(void)xe_pt_walk_shared(&pt->base, pt->level,
> > > > xe_vma_start(vma),
> > > > -				xe_vma_end(vma), &xe_walk.base);
> > > > -
> > > > -	return xe_walk.needs_invalidate;
> > > > +	return __xe_pt_zap_ptes(tile, vma, flags);
> > > >  }
> > > >  
> > > >  static void
> > > > @@ -1201,7 +1233,46 @@ static bool
> > > > xe_pt_userptr_inject_eagain(struct
> > > > xe_userptr_vma *uvma)
> > > >  
> > > >  #endif
> > > >  
> > > > -static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> > > > +static void
> > > > +vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
> > > > +			    struct xe_vm_pgtable_update_ops *pt_update)
> > > > +{
> > > > +	struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
> > > > +	int i, j, k;
> > > > +
> > > > +	/*
> > > > +	 * Need to update this function to bypass scratch setup if in fault mode
> > > > +	 */
> > > > +	xe_assert(xe_vma_vm(vma)->xe, !xe_vm_in_fault_mode(xe_vma_vm(vma)));
> > > > +
> > > > +	for (i = 0; i < pt_update->current_op; ++i) {
> > > > +		struct xe_vm_pgtable_update_op *op = &pt_update->ops[i];
> > > > +
> > > > +		if (vma != op->vma || (!op->bind && !op->rebind))
> > > > +			continue;
> > > > +
> > > > +		for (j = 0; j < op->num_entries; ++j) {
> > > > +			for (k = 0; k < op->entries[j].qwords; ++k) {
> > > > +				struct xe_pt_entry *entry =
> > > > +					&op->entries[j].pt_entries[k];
> > > > +				unsigned int level = entry->level;
> > > > +
> > > > +				if (!(level & XE_PT_IS_LEAF))
> > > > +					continue;
> > > > +
> > > > +				level &= ~XE_PT_IS_LEAF;
> > > > +				entry->pte = __xe_pt_empty_pte(tile,
> > > > +							       xe_vma_vm(vma),
> > > > +							       level);
> > > > +			}
> > > > +		}
> > > > +	}
> > > > +
> > > > +	__xe_pt_zap_ptes(tile, vma, flags);
> > > 
> > > As mentioned in my previous email, I'm pretty sure if we modify all
> > > the
> > > ptes in the entry array, not just the leaves, (that's basically all
> > > ptes of shared page-table entries) that will be equivalent to a
> > > zap.
> > > 
> > 
> > That doesn't work. I had it that way originally, but IGTs fail with
> > IOMMU CAT errors (e.g. xe_exec_basic.once-userptr fails like this [1]).
> > 
> > Let me explain with an example.
> > 
> > Array of binds: 0x0000-0x1000 (A), 0x1000-0x2000 (B)
> > 
> > - (A) hits userptr invalidation
> > 	- If we modify all ptes in the entry array, the highest level PDE is
> > 	  invalidated. Both (A) and (B) are either 0 or scratch.
> > - (A) does rebind in exec
> > 	- We only modify the leaf entry, not the highest level PDE, which is
> > 	  0 or scratch.
> > 
> > Matt
> 
> Argh. You're right. What would happen if we don't do anything to the
> ptes, then? It looks from the code that in faulting mode we error and
> unwind with -EAGAIN.
> 
> In preempt-fence mode and !LR we could instead ensure we properly stop
> gpu access and put the userptr on the invalidated list in the notifier.
> 

In dma-fence mode we'd have a window where existing (previously
submitted) GPU jobs could corrupt pages which have been invalidated.
Unlikely, but it's a very subtle security / data-corruption issue.

> The reason I don't really like zapping here is that it's yet another
> corner case in the code. If it works by doing this in the notifier, we
> get rid of two corner cases.
> 

I think the only other option here is to return -EAGAIN, but with that
approach we also have to tweak the error injection to be per user bind
rather than per bind op.

Can't say I love the approach in this patch, but it does seem to work,
and I think a tangible benefit could be seen in SVM prefetch, where we
can squash parts of a prefetch into an invalidation and where this race
is more likely to occur.

Open to this patch or changing to -EAGAIN.

Matt
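
[To make the failure mode in the example above concrete, here is a
minimal user-space toy model of a two-level page table. This is an
editor's sketch only: all names are invented and none of this is Xe
driver code.]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy two-level table: one shared PDE covering two leaf PTEs, A and B. */
#define VALID 1ull

static uint64_t leaf[2];
static uint64_t root;	/* shared PDE for both A and B */

static void bind_all(void)
{
	root = (uint64_t)(uintptr_t)leaf | VALID; /* PDE -> leaf table */
	leaf[0] = 0xaaaa0000 | VALID;	/* A: 0x0000-0x1000 */
	leaf[1] = 0xbbbb0000 | VALID;	/* B: 0x1000-0x2000 */
}

/*
 * Invalidate (A) by clearing every pte in its entry array, including
 * the shared PDE -- the variant that fails with CAT errors.
 */
static void zap_a_including_shared(void)
{
	leaf[0] = 0;
	root = 0;	/* (B)'s translation dies too */
}

/* Rebind of (A) in exec only rewrites (A)'s leaf entry. */
static void rebind_a_leaf_only(void)
{
	leaf[0] = 0xaaaa0000 | VALID;
}

static int reachable(int idx)
{
	return (root & VALID) && (leaf[idx] & VALID);
}

int main(void)
{
	bind_all();
	zap_a_including_shared();
	rebind_a_leaf_only();

	/*
	 * (A)'s leaf PTE is valid again, but the walk still dies at the
	 * zeroed PDE -- the scenario behind the IGT failure in [1].
	 */
	printf("A reachable: %d, B reachable: %d (both should be 1)\n",
	       reachable(0), reachable(1));
	assert(!reachable(0) && !reachable(1));
	return 0;
}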

> /Thomas
> 
> 
> 
> 
> > 
> > [1]
> > [  359.895920] [IGT] xe_exec_basic: starting subtest once-userptr
> > [  359.902643] xe 0000:03:00.0: [drm:pf_queue_work_func [xe]]
> >                 ASID: 462
> >                 VFID: 0
> >                 PDATA: 0x0c90
> >                 Faulted Address: 0x00000000001a0000
> >                 FaultType: 0
> >                 AccessType: 0
> >                 FaultLevel: 4
> >                 EngineClass: 0 rcs
> >                 EngineInstance: 0
> > [  359.902686] xe 0000:03:00.0: [drm:pf_queue_work_func [xe]] Fault
> > response: Unsuccessful -22
> > [  359.902890] xe 0000:03:00.0:
> > [drm:xe_guc_exec_queue_memory_cat_error_handler [xe]] GT0: Engine
> > memory cat error: engine_class=rcs, logical_mask: 0x1, guc_id=2
> > [  359.905791] xe 0000:03:00.0: [drm] GT0: Engine reset:
> > engine_class=rcs, logical_mask: 0x1, guc_id=2
> > [  359.905826] xe 0000:03:00.0: [drm] GT0: Timedout job:
> > seqno=4294967169, lrc_seqno=4294967169, guc_id=2, flags=0x0 in
> > xe_exec_basic [9607]
> > [  359.905831] xe 0000:03:00.0: [drm:xe_devcoredump [xe]] Multiple
> > hangs are occurring, but only the first snapshot was taken
> > [  359.962840] [IGT] xe_exec_basic: finished subtest once-userptr,
> > FAIL
> > [  359.963049] [IGT] xe_exec_basic: exiting, ret=98
> > 
> > 
> > > /Thomas
> > > 
> > > 
> > > > +}
> > > > +
> > > > +static int vma_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
> > > > +			     struct xe_vma *vma,
> > > >  			     struct xe_vm_pgtable_update_ops *pt_update)
> > > >  {
> > > >  	struct xe_userptr_vma *uvma;
> > > > @@ -1215,9 +1286,6 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> > > >  	uvma = to_userptr_vma(vma);
> > > >  	notifier_seq = uvma->userptr.notifier_seq;
> > > >  
> > > > -	if (uvma->userptr.initial_bind && !xe_vm_in_fault_mode(vm))
> > > > -		return 0;
> > > > -
> > > >  	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
> > > >  				     notifier_seq) &&
> > > >  	    !xe_pt_userptr_inject_eagain(uvma))
> > > > @@ -1226,6 +1294,8 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> > > >  	if (xe_vm_in_fault_mode(vm)) {
> > > >  		return -EAGAIN;
> > > >  	} else {
> > > > +		long err;
> > > > +
> > > >  		spin_lock(&vm->userptr.invalidated_lock);
> > > >  		list_move_tail(&uvma->userptr.invalidate_link,
> > > >  			       &vm->userptr.invalidated);
> > > > @@ -1234,25 +1304,27 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> > > >  		if (xe_vm_in_preempt_fence_mode(vm)) {
> > > >  			struct dma_resv_iter cursor;
> > > >  			struct dma_fence *fence;
> > > > -			long err;
> > > >  
> > > >  			dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
> > > >  					    DMA_RESV_USAGE_BOOKKEEP);
> > > >  			dma_resv_for_each_fence_unlocked(&cursor, fence)
> > > >  				dma_fence_enable_sw_signaling(fence);
> > > >  			dma_resv_iter_end(&cursor);
> > > > -
> > > > -			err = dma_resv_wait_timeout(xe_vm_resv(vm),
> > > > -						    DMA_RESV_USAGE_BOOKKEEP,
> > > > -						    false, MAX_SCHEDULE_TIMEOUT);
> > > > -			XE_WARN_ON(err <= 0);
> > > >  		}
> > > > +
> > > > +		err = dma_resv_wait_timeout(xe_vm_resv(vm),
> > > > +					    DMA_RESV_USAGE_BOOKKEEP,
> > > > +					    false, MAX_SCHEDULE_TIMEOUT);
> > > > +		XE_WARN_ON(err <= 0);
> > > > +
> > > > +		vma_convert_to_invalidation(tile, vma, pt_update);
> > > >  	}
> > > >  
> > > >  	return 0;
> > > >  }
> > > >  
> > > > -static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
> > > > +static int op_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
> > > > +			    struct xe_vma_op *op,
> > > >  			    struct xe_vm_pgtable_update_ops *pt_update)
> > > >  {
> > > >  	int err = 0;
> > > > @@ -1264,18 +1336,21 @@ static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
> > > >  		if (!op->map.immediate && xe_vm_in_fault_mode(vm))
> > > >  			break;
> > > >  
> > > > -		err = vma_check_userptr(vm, op->map.vma, pt_update);
> > > > +		err = vma_check_userptr(tile, vm, op->map.vma, pt_update);
> > > >  		break;
> > > >  	case DRM_GPUVA_OP_REMAP:
> > > >  		if (op->remap.prev)
> > > > -			err = vma_check_userptr(vm, op->remap.prev, pt_update);
> > > > +			err = vma_check_userptr(tile, vm, op->remap.prev,
> > > > +						pt_update);
> > > >  		if (!err && op->remap.next)
> > > > -			err = vma_check_userptr(vm, op->remap.next, pt_update);
> > > > +			err = vma_check_userptr(tile, vm, op->remap.next,
> > > > +						pt_update);
> > > >  		break;
> > > >  	case DRM_GPUVA_OP_UNMAP:
> > > >  		break;
> > > >  	case DRM_GPUVA_OP_PREFETCH:
> > > > -		err = vma_check_userptr(vm, gpuva_to_vma(op->base.prefetch.va),
> > > > +		err = vma_check_userptr(tile, vm,
> > > > +					gpuva_to_vma(op->base.prefetch.va),
> > > >  					pt_update);
> > > >  		break;
> > > >  	default:
> > > > @@ -1301,7 +1376,8 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
> > > >  	down_read(&vm->userptr.notifier_lock);
> > > >  
> > > >  	list_for_each_entry(op, &vops->list, link) {
> > > > -		err = op_check_userptr(vm, op, pt_update_ops);
> > > > +		err = op_check_userptr(&vm->xe->tiles[pt_update->tile_id],
> > > > +				       vm, op, pt_update_ops);
> > > >  		if (err) {
> > > >  			up_read(&vm->userptr.notifier_lock);
> > > >  			break;
> > > > diff --git a/drivers/gpu/drm/xe/xe_pt_types.h b/drivers/gpu/drm/xe/xe_pt_types.h
> > > > index 384cc04de719..6f99ff2b8fce 100644
> > > > --- a/drivers/gpu/drm/xe/xe_pt_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_pt_types.h
> > > > @@ -29,7 +29,6 @@ struct xe_pt {
> > > >  	struct xe_bo *bo;
> > > >  	unsigned int level;
> > > >  	unsigned int num_live;
> > > > -	bool rebind;
> > > >  	bool is_compact;
> > > >  #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
> > > >  	/** addr: Virtual address start address of the PT. */
> > > > @@ -52,6 +51,8 @@ struct xe_pt_ops {
> > > >  struct xe_pt_entry {
> > > >  	struct xe_pt *pt;
> > > >  	u64 pte;
> > > > +#define XE_PT_IS_LEAF	BIT(31)
> > > > +	unsigned int level;
> > > >  };
> > > >  
> > > >  struct xe_vm_pgtable_update {
> > > > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > > > index ea2e287e6526..f90e5c92010c 100644
> > > > --- a/drivers/gpu/drm/xe/xe_vm.c
> > > > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > > > @@ -623,8 +623,6 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
> > > >  		spin_unlock(&vm->userptr.invalidated_lock);
> > > >  	}
> > > >  
> > > > -	up_write(&vm->userptr.notifier_lock);
> > > > -
> > > >  	/*
> > > >  	 * Preempt fences turn into schedule disables, pipeline these.
> > > >  	 * Note that even in fault mode, we need to wait for binds and
> > > > @@ -647,6 +645,8 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
> > > >  		XE_WARN_ON(err);
> > > >  	}
> > > >  
> > > > +	up_write(&vm->userptr.notifier_lock);
> > > > +
> > > >  	trace_xe_vma_userptr_invalidate_complete(vma);
> > > >  
> > > >  	return true;

* ✗ CI.Patch_applied: failure for Userptr fixes
  2025-02-24  4:05 [PATCH v2 0/3] Userptr fixes Matthew Brost
                   ` (2 preceding siblings ...)
  2025-02-24  4:05 ` [PATCH v2 3/3] drm/xe: Add staging tree for VM binds Matthew Brost
@ 2025-02-24 16:15 ` Patchwork
  3 siblings, 0 replies; 14+ messages in thread
From: Patchwork @ 2025-02-24 16:15 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe

== Series Details ==

Series: Userptr fixes
URL   : https://patchwork.freedesktop.org/series/145317/
State : failure

== Summary ==

=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: a5a9504217e2 drm-tip: 2025y-02m-24d-16h-04m-10s UTC integration manifest
=== git am output follows ===
error: patch failed: drivers/gpu/drm/xe/xe_vm.c:681
error: drivers/gpu/drm/xe/xe_vm.c: patch does not apply
hint: Use 'git am --show-current-patch=diff' to see the failed patch
Applying: drm/xe/userptr: fix EFAULT handling
Patch failed at 0001 drm/xe/userptr: fix EFAULT handling
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".




* Re: [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes
  2025-02-24 15:35         ` Matthew Brost
@ 2025-02-24 16:22           ` Thomas Hellström
  2025-02-24 16:30             ` Matthew Brost
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Hellström @ 2025-02-24 16:22 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe, matthew.auld

On Mon, 2025-02-24 at 07:35 -0800, Matthew Brost wrote:
> On Mon, Feb 24, 2025 at 04:20:07PM +0100, Thomas Hellström wrote:
> > On Mon, 2025-02-24 at 07:06 -0800, Matthew Brost wrote:
> > > On Mon, Feb 24, 2025 at 09:21:09AM +0100, Thomas Hellström wrote:
> > > > On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> > > > > [snip]
> 
> In dma-fence mode we'd have a window where existing (previously
> submited) GPU jobs could corrupt pages which have been invalidated.
> Unlikely but a very subtle security / data corruption issue.

Agreed, we should not allow that to happen, however small the chance is.

However, I was thinking of something along the lines of the following. I
can't see any dma-fence jobs accessing the invalidated pages here, since
we wait for already-submitted jobs to complete, and no new ones can be
submitted unless the userptr has been rebound.

Or are you thinking about the iommu issue where we should really
dma_unmap() in the notifier, like the SVM code does? 

Or could you describe the race more in detail?

/Thomas


diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index d664f2e418b2..c90c0687be51 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -605,12 +605,6 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
        down_write(&vm->userptr.notifier_lock);
        mmu_interval_set_seq(mni, cur_seq);
 
-       /* No need to stop gpu access if the userptr is not yet bound. */
-       if (!userptr->initial_bind) {
-               up_write(&vm->userptr.notifier_lock);
-               return true;
-       }
-
        /*
         * Tell exec and rebind worker they need to repin and rebind this
         * userptr.
@@ -623,8 +617,6 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
                spin_unlock(&vm->userptr.invalidated_lock);
        }
 
-       up_write(&vm->userptr.notifier_lock);
-
        /*
         * Preempt fences turn into schedule disables, pipeline these.
         * Note that even in fault mode, we need to wait for binds and
@@ -642,11 +634,13 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
                                    false, MAX_SCHEDULE_TIMEOUT);
        XE_WARN_ON(err <= 0);
 
-       if (xe_vm_in_fault_mode(vm)) {
+       if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
                err = xe_vm_invalidate_vma(vma);
                XE_WARN_ON(err);
        }
 
+       up_write(&vm->userptr.notifier_lock);
+
        trace_xe_vma_userptr_invalidate_complete(vma);
 
        return true;




* Re: [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes
  2025-02-24 16:22           ` Thomas Hellström
@ 2025-02-24 16:30             ` Matthew Brost
  0 siblings, 0 replies; 14+ messages in thread
From: Matthew Brost @ 2025-02-24 16:30 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: intel-xe, matthew.auld

On Mon, Feb 24, 2025 at 05:22:40PM +0100, Thomas Hellström wrote:
> On Mon, 2025-02-24 at 07:35 -0800, Matthew Brost wrote:
> > On Mon, Feb 24, 2025 at 04:20:07PM +0100, Thomas Hellström wrote:
> > > On Mon, 2025-02-24 at 07:06 -0800, Matthew Brost wrote:
> > > > On Mon, Feb 24, 2025 at 09:21:09AM +0100, Thomas Hellström wrote:
> > > > > On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> > > > > > [snip]
> > 
> > In dma-fence mode we'd have a window where existing (previously
> > submitted) GPU jobs could corrupt pages which have been invalidated.
> > Unlikely, but it's a very subtle security / data-corruption issue.
> 
> Agreed, we should not allow that to happen, however small the chance is.
> 
> However, I was thinking of something along the lines of the following. I
> can't see any dma-fence jobs accessing the invalidated pages here, since
> we wait for already-submitted jobs to complete, and no new ones can be
> submitted unless the userptr has been rebound.
> 

Yes, I missed this. After my change to wait on the dma-resv in dma-fence
mode too, there shouldn't be anything running, so it is safe to
temporarily leave the page tables pointing to the invalidated pages, as
we will fix up the pages on the next exec. The original code before my
patch had this window. Let me respin this patch to simply wait on the
dma-resv and drop the complicated invalidation.

Matt
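
[A minimal sketch of what that simplification could look like in the
!fault-mode branch of vma_check_userptr(). This is an editor's
assumption based on the v2 code quoted earlier, not the actual respin;
vma_convert_to_invalidation() and the scratch zap drop out entirely.]

	} else {
		long err;

		spin_lock(&vm->userptr.invalidated_lock);
		list_move_tail(&uvma->userptr.invalidate_link,
			       &vm->userptr.invalidated);
		spin_unlock(&vm->userptr.invalidated_lock);

		if (xe_vm_in_preempt_fence_mode(vm)) {
			struct dma_resv_iter cursor;
			struct dma_fence *fence;

			dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
					    DMA_RESV_USAGE_BOOKKEEP);
			dma_resv_for_each_fence_unlocked(&cursor, fence)
				dma_fence_enable_sw_signaling(fence);
			dma_resv_iter_end(&cursor);
		}

		/*
		 * Nothing can be running once this wait returns, so the
		 * stale PTEs are harmless: the next exec repins and
		 * rebinds before any new job can sample them.
		 */
		err = dma_resv_wait_timeout(xe_vm_resv(vm),
					    DMA_RESV_USAGE_BOOKKEEP,
					    false, MAX_SCHEDULE_TIMEOUT);
		XE_WARN_ON(err <= 0);
	}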

> Or are you thinking about the iommu issue where we should really
> dma_unmap() in the notifier, like the SVM code does? 
> 
> Or could you describe the race more in detail?
> 
> /Thomas


end of thread

Thread overview: 14+ messages
2025-02-24  4:05 [PATCH v2 0/3] Userptr fixes Matthew Brost
2025-02-24  4:05 ` [PATCH v2 1/3] drm/xe/userptr: fix EFAULT handling Matthew Brost
2025-02-24  4:05 ` [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes Matthew Brost
2025-02-24  8:21   ` Thomas Hellström
2025-02-24 15:06     ` Matthew Brost
2025-02-24 15:20       ` Thomas Hellström
2025-02-24 15:35         ` Matthew Brost
2025-02-24 16:22           ` Thomas Hellström
2025-02-24 16:30             ` Matthew Brost
2025-02-24  4:05 ` [PATCH v2 3/3] drm/xe: Add staging tree for VM binds Matthew Brost
2025-02-24  9:22   ` Thomas Hellström
2025-02-24 14:41     ` Matthew Brost
2025-02-24 16:15 ` ✗ CI.Patch_applied: failure for Userptr fixes Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2025-02-24  3:24 [PATCH v2 0/3] " Matthew Brost
2025-02-24  3:24 ` [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes Matthew Brost
