Intel-XE Archive on lore.kernel.org
* [PATCH v5 0/6] Fix serialization on burst of unbinds - v2
@ 2025-10-29 20:57 Matthew Brost
  2025-10-29 20:57 ` [PATCH v5 1/6] drm/xe: Enforce correct user fence signaling order using drm_syncobjs Matthew Brost
                   ` (5 more replies)
  0 siblings, 6 replies; 16+ messages in thread
From: Matthew Brost @ 2025-10-29 20:57 UTC (permalink / raw)
  To: intel-xe; +Cc: thomas.hellstrom

Attempting to resolve part of [1]; this solution differs from v1 [2]
by changing the last fence semantics, as detailed in the patches.

Overview of the issue in [1]:

When a burst of unbind jobs is issued, a dependency chain can form
between the TLB invalidation of a previous unbind job and the current
one. This leads to undesirable serialization: the current job waits
unnecessarily on prior TLB invalidations and executes on the GPU when
it need not, significantly slowing the unbind burst (up to a 4×
slowdown).
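
To sketch the problem (an illustrative diagram, not lifted from the
driver):

  unbind[N]:   bind job[N] ----> TLB inval[N]
  unbind[N+1]: bind job[N+1] waits on TLB inval[N] via the queue's
               last fence, then ----> TLB inval[N+1]

Each unbind only needs its own invalidation, yet the shared last fence
forces it to also wait on the previous one.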
 
Matt

[1] https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
[2] https://patchwork.freedesktop.org/series/156144/

Matthew Brost (6):
  drm/xe: Enforce correct user fence signaling order using drm_syncobjs
  drm/xe: Attach last fence to TLB invalidation job queues
  drm/xe: Decouple bind queue last fence from TLB invalidations
  drm/xe: Skip TLB invalidation waits in page fault binds
  drm/xe: Disallow input fences on zero batch execs and zero binds
  drm/xe: Remove last fence dependency check from binds

 drivers/gpu/drm/xe/xe_exec.c             |   3 +-
 drivers/gpu/drm/xe/xe_exec_queue.c       | 119 ++++++++++++++++++++---
 drivers/gpu/drm/xe/xe_exec_queue.h       |  23 ++++-
 drivers/gpu/drm/xe/xe_exec_queue_types.h |  12 +++
 drivers/gpu/drm/xe/xe_migrate.c          |  14 +++
 drivers/gpu/drm/xe/xe_migrate.h          |   8 ++
 drivers/gpu/drm/xe/xe_oa.c               |  45 ++++++---
 drivers/gpu/drm/xe/xe_oa_types.h         |   8 ++
 drivers/gpu/drm/xe/xe_pt.c               |  80 +++++----------
 drivers/gpu/drm/xe/xe_sync.c             |  91 ++++++++++-------
 drivers/gpu/drm/xe/xe_sync.h             |   3 +
 drivers/gpu/drm/xe/xe_sync_types.h       |   3 +
 drivers/gpu/drm/xe/xe_tlb_inval_job.c    |  31 ++++--
 drivers/gpu/drm/xe/xe_tlb_inval_job.h    |   5 +-
 drivers/gpu/drm/xe/xe_vm.c               |  97 ++++++++++--------
 drivers/gpu/drm/xe/xe_vm_types.h         |   6 +-
 16 files changed, 376 insertions(+), 172 deletions(-)

-- 
2.34.1



* [PATCH v5 1/6] drm/xe: Enforce correct user fence signaling order using drm_syncobjs
  2025-10-29 20:57 [PATCH v5 0/6] Fix serialization on burst of unbinds - v2 Matthew Brost
@ 2025-10-29 20:57 ` Matthew Brost
  2025-10-30  7:58   ` Thomas Hellström
  2025-10-29 20:57 ` [PATCH v5 2/6] drm/xe: Attach last fence to TLB invalidation job queues Matthew Brost
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 16+ messages in thread
From: Matthew Brost @ 2025-10-29 20:57 UTC (permalink / raw)
  To: intel-xe; +Cc: thomas.hellstrom

Prevent application hangs caused by out-of-order fence signaling when
user fences are attached. Use drm_syncobj (via dma-fence-chain) to
guarantee that each user fence signals in order, regardless of the
signaling order of the attached fences. Ensure user fence writebacks to
user space occur in the correct sequence.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
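A minimal sketch of the ordering scheme (illustrative; syncobj,
timeline_value, job_fence and ufence stand in for the fields this
patch adds):

	struct dma_fence_chain *chain;
	struct dma_fence *ordered;
	int err;

	chain = dma_fence_chain_alloc();
	if (!chain)
		return -ENOMEM;

	/* Point N + 1 on the timeline cannot signal before point N */
	drm_syncobj_add_point(syncobj, chain, job_fence, ++timeline_value);

	/* Attach the writeback to the chain head, not the raw job fence */
	ordered = drm_syncobj_fence_get(syncobj);
	err = dma_fence_add_callback(ordered, &ufence->cb, user_fence_cb);
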
 drivers/gpu/drm/xe/xe_exec.c             |  3 +-
 drivers/gpu/drm/xe/xe_exec_queue.c       | 13 +++++++
 drivers/gpu/drm/xe/xe_exec_queue_types.h |  7 ++++
 drivers/gpu/drm/xe/xe_oa.c               | 45 ++++++++++++++++--------
 drivers/gpu/drm/xe/xe_oa_types.h         |  8 +++++
 drivers/gpu/drm/xe/xe_sync.c             | 17 +++++++--
 drivers/gpu/drm/xe/xe_sync.h             |  3 ++
 drivers/gpu/drm/xe/xe_sync_types.h       |  3 ++
 drivers/gpu/drm/xe/xe_vm.c               |  4 +++
 9 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 0dc27476832b..f7b9d8715053 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -166,7 +166,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
-					  &syncs_user[num_syncs], SYNC_PARSE_FLAG_EXEC |
+					  &syncs_user[num_syncs], NULL, 0,
+					  SYNC_PARSE_FLAG_EXEC |
 					  (xe_vm_in_lr_mode(vm) ?
 					   SYNC_PARSE_FLAG_LR_MODE : 0));
 		if (err)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index 90cbc95f8e2e..6e168efbac65 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -10,6 +10,7 @@
 #include <drm/drm_device.h>
 #include <drm/drm_drv.h>
 #include <drm/drm_file.h>
+#include <drm/drm_syncobj.h>
 #include <uapi/drm/xe_drm.h>
 
 #include "xe_dep_scheduler.h"
@@ -345,6 +346,7 @@ struct xe_exec_queue *xe_exec_queue_create_bind(struct xe_device *xe,
 	struct xe_gt *gt = tile->primary_gt;
 	struct xe_exec_queue *q;
 	struct xe_vm *migrate_vm;
+	int err;
 
 	migrate_vm = xe_migrate_get_vm(tile->migrate);
 	if (xe->info.has_usm) {
@@ -368,6 +370,14 @@ struct xe_exec_queue *xe_exec_queue_create_bind(struct xe_device *xe,
 	}
 	xe_vm_put(migrate_vm);
 
+	err = drm_syncobj_create(&q->ufence_syncobj,
+				 DRM_SYNCOBJ_CREATE_SIGNALED,
+				 NULL);
+	if (err) {
+		xe_exec_queue_put(q);
+		return ERR_PTR(err);
+	}
+
 	return q;
 }
 ALLOW_ERROR_INJECTION(xe_exec_queue_create_bind, ERRNO);
@@ -377,6 +387,9 @@ void xe_exec_queue_destroy(struct kref *ref)
 	struct xe_exec_queue *q = container_of(ref, struct xe_exec_queue, refcount);
 	struct xe_exec_queue *eq, *next;
 
+	if (q->ufence_syncobj)
+		drm_syncobj_put(q->ufence_syncobj);
+
 	if (xe_exec_queue_uses_pxp(q))
 		xe_pxp_exec_queue_remove(gt_to_xe(q->gt)->pxp, q);
 
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 282505fa1377..838266c3914b 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -15,6 +15,7 @@
 #include "xe_hw_fence_types.h"
 #include "xe_lrc_types.h"
 
+struct drm_syncobj;
 struct xe_execlist_exec_queue;
 struct xe_gt;
 struct xe_guc_exec_queue;
@@ -155,6 +156,12 @@ struct xe_exec_queue {
 		struct list_head link;
 	} pxp;
 
+	/** @ufence_syncobj: User fence syncobj */
+	struct drm_syncobj *ufence_syncobj;
+
+	/** @ufence_timeline_value: User fence timeline value */
+	u64 ufence_timeline_value;
+
 	/** @ops: submission backend exec queue operations */
 	const struct xe_exec_queue_ops *ops;
 
diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
index f901ba52b403..7a13a7bd99a6 100644
--- a/drivers/gpu/drm/xe/xe_oa.c
+++ b/drivers/gpu/drm/xe/xe_oa.c
@@ -10,6 +10,7 @@
 
 #include <drm/drm_drv.h>
 #include <drm/drm_managed.h>
+#include <drm/drm_syncobj.h>
 #include <uapi/drm/xe_drm.h>
 
 #include <generated/xe_wa_oob.h>
@@ -1390,7 +1391,9 @@ static int xe_oa_user_extensions(struct xe_oa *oa, enum xe_oa_user_extn_from fro
 	return 0;
 }
 
-static int xe_oa_parse_syncs(struct xe_oa *oa, struct xe_oa_open_param *param)
+static int xe_oa_parse_syncs(struct xe_oa *oa,
+			     struct xe_oa_stream *stream,
+			     struct xe_oa_open_param *param)
 {
 	int ret, num_syncs, num_ufence = 0;
 
@@ -1410,7 +1413,9 @@ static int xe_oa_parse_syncs(struct xe_oa *oa, struct xe_oa_open_param *param)
 
 	for (num_syncs = 0; num_syncs < param->num_syncs; num_syncs++) {
 		ret = xe_sync_entry_parse(oa->xe, param->xef, &param->syncs[num_syncs],
-					  &param->syncs_user[num_syncs], 0);
+					  &param->syncs_user[num_syncs],
+					  stream->ufence_syncobj,
+					  ++stream->ufence_timeline_value, 0);
 		if (ret)
 			goto err_syncs;
 
@@ -1540,7 +1545,7 @@ static long xe_oa_config_locked(struct xe_oa_stream *stream, u64 arg)
 		return -ENODEV;
 
 	param.xef = stream->xef;
-	err = xe_oa_parse_syncs(stream->oa, &param);
+	err = xe_oa_parse_syncs(stream->oa, stream, &param);
 	if (err)
 		goto err_config_put;
 
@@ -1636,6 +1641,7 @@ static void xe_oa_destroy_locked(struct xe_oa_stream *stream)
 	if (stream->exec_q)
 		xe_exec_queue_put(stream->exec_q);
 
+	drm_syncobj_put(stream->ufence_syncobj);
 	kfree(stream);
 }
 
@@ -1827,6 +1833,7 @@ static int xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
 					  struct xe_oa_open_param *param)
 {
 	struct xe_oa_stream *stream;
+	struct drm_syncobj *ufence_syncobj;
 	int stream_fd;
 	int ret;
 
@@ -1837,17 +1844,31 @@ static int xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
 		goto exit;
 	}
 
+	ret = drm_syncobj_create(&ufence_syncobj, DRM_SYNCOBJ_CREATE_SIGNALED,
+				 NULL);
+	if (ret)
+		goto exit;
+
 	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
 	if (!stream) {
 		ret = -ENOMEM;
-		goto exit;
+		goto err_syncobj;
 	}
-
+	stream->ufence_syncobj = ufence_syncobj;
 	stream->oa = oa;
-	ret = xe_oa_stream_init(stream, param);
+
+	ret = xe_oa_parse_syncs(oa, stream, param);
 	if (ret)
 		goto err_free;
 
+	ret = xe_oa_stream_init(stream, param);
+	if (ret) {
+		while (param->num_syncs--)
+			xe_sync_entry_cleanup(&param->syncs[param->num_syncs]);
+		kfree(param->syncs);
+		goto err_free;
+	}
+
 	if (!param->disabled) {
 		ret = xe_oa_enable_locked(stream);
 		if (ret)
@@ -1871,6 +1892,8 @@ static int xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
 	xe_oa_stream_destroy(stream);
 err_free:
 	kfree(stream);
+err_syncobj:
+	drm_syncobj_put(ufence_syncobj);
 exit:
 	return ret;
 }
@@ -2084,22 +2107,14 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f
 		goto err_exec_q;
 	}
 
-	ret = xe_oa_parse_syncs(oa, &param);
-	if (ret)
-		goto err_exec_q;
-
 	mutex_lock(&param.hwe->gt->oa.gt_lock);
 	ret = xe_oa_stream_open_ioctl_locked(oa, &param);
 	mutex_unlock(&param.hwe->gt->oa.gt_lock);
 	if (ret < 0)
-		goto err_sync_cleanup;
+		goto err_exec_q;
 
 	return ret;
 
-err_sync_cleanup:
-	while (param.num_syncs--)
-		xe_sync_entry_cleanup(&param.syncs[param.num_syncs]);
-	kfree(param.syncs);
 err_exec_q:
 	if (param.exec_q)
 		xe_exec_queue_put(param.exec_q);
diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h
index 2628f78c4e8d..daf701b5d48b 100644
--- a/drivers/gpu/drm/xe/xe_oa_types.h
+++ b/drivers/gpu/drm/xe/xe_oa_types.h
@@ -15,6 +15,8 @@
 #include "regs/xe_reg_defs.h"
 #include "xe_hw_engine_types.h"
 
+struct drm_syncobj;
+
 #define DEFAULT_XE_OA_BUFFER_SIZE SZ_16M
 
 enum xe_oa_report_header {
@@ -248,6 +250,12 @@ struct xe_oa_stream {
 	/** @xef: xe_file with which the stream was opened */
 	struct xe_file *xef;
 
+	/** @ufence_syncobj: User fence syncobj */
+	struct drm_syncobj *ufence_syncobj;
+
+	/** @ufence_timeline_value: User fence timeline value */
+	u64 ufence_timeline_value;
+
 	/** @last_fence: fence to use in stream destroy when needed */
 	struct dma_fence *last_fence;
 
diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index 82872a51f098..d48ab7b32ca5 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -113,6 +113,8 @@ static void user_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
 int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
 			struct xe_sync_entry *sync,
 			struct drm_xe_sync __user *sync_user,
+			struct drm_syncobj *ufence_syncobj,
+			u64 ufence_timeline_value,
 			unsigned int flags)
 {
 	struct drm_xe_sync sync_in;
@@ -192,10 +194,15 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
 		if (exec) {
 			sync->addr = sync_in.addr;
 		} else {
+			sync->ufence_timeline_value = ufence_timeline_value;
 			sync->ufence = user_fence_create(xe, sync_in.addr,
 							 sync_in.timeline_value);
 			if (XE_IOCTL_DBG(xe, IS_ERR(sync->ufence)))
 				return PTR_ERR(sync->ufence);
+			sync->ufence_chain_fence = dma_fence_chain_alloc();
+			if (!sync->ufence_chain_fence)
+				return -ENOMEM;
+			sync->ufence_syncobj = ufence_syncobj;
 		}
 
 		break;
@@ -239,7 +246,12 @@ void xe_sync_entry_signal(struct xe_sync_entry *sync, struct dma_fence *fence)
 	} else if (sync->ufence) {
 		int err;
 
-		dma_fence_get(fence);
+		drm_syncobj_add_point(sync->ufence_syncobj,
+				      sync->ufence_chain_fence,
+				      fence, sync->ufence_timeline_value);
+		sync->ufence_chain_fence = NULL;
+
+		fence = drm_syncobj_fence_get(sync->ufence_syncobj);
 		user_fence_get(sync->ufence);
 		err = dma_fence_add_callback(fence, &sync->ufence->cb,
 					     user_fence_cb);
@@ -259,7 +271,8 @@ void xe_sync_entry_cleanup(struct xe_sync_entry *sync)
 		drm_syncobj_put(sync->syncobj);
 	dma_fence_put(sync->fence);
 	dma_fence_chain_free(sync->chain_fence);
-	if (sync->ufence)
+	dma_fence_chain_free(sync->ufence_chain_fence);
+	if (!IS_ERR_OR_NULL(sync->ufence))
 		user_fence_put(sync->ufence);
 }
 
diff --git a/drivers/gpu/drm/xe/xe_sync.h b/drivers/gpu/drm/xe/xe_sync.h
index 256ffc1e54dc..51f2d803e977 100644
--- a/drivers/gpu/drm/xe/xe_sync.h
+++ b/drivers/gpu/drm/xe/xe_sync.h
@@ -8,6 +8,7 @@
 
 #include "xe_sync_types.h"
 
+struct drm_syncobj;
 struct xe_device;
 struct xe_exec_queue;
 struct xe_file;
@@ -21,6 +22,8 @@ struct xe_vm;
 int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
 			struct xe_sync_entry *sync,
 			struct drm_xe_sync __user *sync_user,
+			struct drm_syncobj *ufence_syncobj,
+			u64 ufence_timeline_value,
 			unsigned int flags);
 int xe_sync_entry_add_deps(struct xe_sync_entry *sync,
 			   struct xe_sched_job *job);
diff --git a/drivers/gpu/drm/xe/xe_sync_types.h b/drivers/gpu/drm/xe/xe_sync_types.h
index 30ac3f51993b..b88f1833e28c 100644
--- a/drivers/gpu/drm/xe/xe_sync_types.h
+++ b/drivers/gpu/drm/xe/xe_sync_types.h
@@ -18,9 +18,12 @@ struct xe_sync_entry {
 	struct drm_syncobj *syncobj;
 	struct dma_fence *fence;
 	struct dma_fence_chain *chain_fence;
+	struct dma_fence_chain *ufence_chain_fence;
+	struct drm_syncobj *ufence_syncobj;
 	struct xe_user_fence *ufence;
 	u64 addr;
 	u64 timeline_value;
+	u64 ufence_timeline_value;
 	u32 type;
 	u32 flags;
 };
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 10d77666a425..4058c476aa2d 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3633,8 +3633,12 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 	syncs_user = u64_to_user_ptr(args->syncs);
 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
+		struct xe_exec_queue *__q = q ?: vm->q[0];
+
 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
 					  &syncs_user[num_syncs],
+					  __q->ufence_syncobj,
+					  ++__q->ufence_timeline_value,
 					  (xe_vm_in_lr_mode(vm) ?
 					   SYNC_PARSE_FLAG_LR_MODE : 0) |
 					  (!args->num_binds ?
-- 
2.34.1



* [PATCH v5 2/6] drm/xe: Attach last fence to TLB invalidation job queues
  2025-10-29 20:57 [PATCH v5 0/6] Fix serialization on burst of unbinds - v2 Matthew Brost
  2025-10-29 20:57 ` [PATCH v5 1/6] drm/xe: Enforce correct user fence signaling order using drm_syncobjs Matthew Brost
@ 2025-10-29 20:57 ` Matthew Brost
  2025-10-30  8:24   ` Thomas Hellström
  2025-10-29 20:57 ` [PATCH v5 3/6] drm/xe: Decouple bind queue last fence from TLB invalidations Matthew Brost
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 16+ messages in thread
From: Matthew Brost @ 2025-10-29 20:57 UTC (permalink / raw)
  To: intel-xe; +Cc: thomas.hellstrom

Add support for attaching the last fence to TLB invalidation job queues
to address serialization issues during bursts of unbind jobs. Ensure
that user fence signaling for a bind job reflects both the bind job
itself and the last fences of all related TLB invalidations. Maintain
submission order based solely on the state of the bind and TLB
invalidation queues.

Introduce support functions for last fence attachment to TLB
invalidation queues.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>

---
v3:
 - Fix assert in xe_exec_queue_tlb_inval_last_fence_set (CI)
 - Ensure migrate lock held for migrate queues (Testing)
v5:
 - Style nits (Thomas)
 - Rewrite commit message (Thomas)
---
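A usage sketch for the new helpers (illustrative; the scheduler job
and the dependency call stand in for whatever the caller tracks):

	struct dma_fence *fence;
	int i, err;

	for_each_tlb_inval(i) {
		/* Returns a stub fence once the last invalidation signals */
		fence = xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
		err = drm_sched_job_add_dependency(&job->drm, fence);
		if (err)
			return err;
	}
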
 drivers/gpu/drm/xe/xe_exec_queue.c       | 103 ++++++++++++++++++++++-
 drivers/gpu/drm/xe/xe_exec_queue.h       |  21 +++++
 drivers/gpu/drm/xe/xe_exec_queue_types.h |   5 ++
 drivers/gpu/drm/xe/xe_migrate.c          |  14 +++
 drivers/gpu/drm/xe/xe_migrate.h          |   8 ++
 drivers/gpu/drm/xe/xe_vm.c               |   7 +-
 6 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index 6e168efbac65..b7551592fe3f 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -386,6 +386,7 @@ void xe_exec_queue_destroy(struct kref *ref)
 {
 	struct xe_exec_queue *q = container_of(ref, struct xe_exec_queue, refcount);
 	struct xe_exec_queue *eq, *next;
+	int i;
 
 	if (q->ufence_syncobj)
 		drm_syncobj_put(q->ufence_syncobj);
@@ -394,6 +395,9 @@ void xe_exec_queue_destroy(struct kref *ref)
 		xe_pxp_exec_queue_remove(gt_to_xe(q->gt)->pxp, q);
 
 	xe_exec_queue_last_fence_put_unlocked(q);
+	for_each_tlb_inval(i)
+		xe_exec_queue_tlb_inval_last_fence_put_unlocked(q, i);
+
 	if (!(q->flags & EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD)) {
 		list_for_each_entry_safe(eq, next, &q->multi_gt_list,
 					 multi_gt_link)
@@ -1011,7 +1015,9 @@ int xe_exec_queue_destroy_ioctl(struct drm_device *dev, void *data,
 static void xe_exec_queue_last_fence_lockdep_assert(struct xe_exec_queue *q,
 						    struct xe_vm *vm)
 {
-	if (q->flags & EXEC_QUEUE_FLAG_VM) {
+	if (q->flags & EXEC_QUEUE_FLAG_MIGRATE) {
+		xe_migrate_job_lock_assert(q);
+	} else if (q->flags & EXEC_QUEUE_FLAG_VM) {
 		lockdep_assert_held(&vm->lock);
 	} else {
 		xe_vm_assert_held(vm);
@@ -1110,6 +1116,7 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm,
 				  struct dma_fence *fence)
 {
 	xe_exec_queue_last_fence_lockdep_assert(q, vm);
+	xe_assert(vm->xe, !dma_fence_is_container(fence));
 
 	xe_exec_queue_last_fence_put(q, vm);
 	q->last_fence = dma_fence_get(fence);
@@ -1138,6 +1145,100 @@ int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
 	return err;
 }
 
+/**
+ * xe_exec_queue_tlb_inval_last_fence_put() - Drop ref to last TLB invalidation fence
+ * @q: The exec queue
+ * @vm: The VM the engine does a bind for
+ * @type: Either primary or media GT
+ */
+void xe_exec_queue_tlb_inval_last_fence_put(struct xe_exec_queue *q,
+					    struct xe_vm *vm,
+					    unsigned int type)
+{
+	xe_exec_queue_last_fence_lockdep_assert(q, vm);
+	xe_assert(vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
+		  type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
+
+	xe_exec_queue_tlb_inval_last_fence_put_unlocked(q, type);
+}
+
+/**
+ * xe_exec_queue_tlb_inval_last_fence_put_unlocked() - Drop ref to last TLB
+ * invalidation fence unlocked
+ * @q: The exec queue
+ * @type: Either primary or media GT
+ *
+ * Only safe to be called from xe_exec_queue_destroy().
+ */
+void xe_exec_queue_tlb_inval_last_fence_put_unlocked(struct xe_exec_queue *q,
+						     unsigned int type)
+{
+	xe_assert(q->vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
+		  type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
+
+	dma_fence_put(q->tlb_inval[type].last_fence);
+	q->tlb_inval[type].last_fence = NULL;
+}
+
+/**
+ * xe_exec_queue_tlb_inval_last_fence_get() - Get last fence for TLB invalidation
+ * @q: The exec queue
+ * @vm: The VM the engine does a bind for
+ * @type: Either primary or media GT
+ *
+ * Get last fence, takes a ref
+ *
+ * Returns: last fence if not signaled, dma fence stub if signaled
+ */
+struct dma_fence *xe_exec_queue_tlb_inval_last_fence_get(struct xe_exec_queue *q,
+							 struct xe_vm *vm,
+							 unsigned int type)
+{
+	struct dma_fence *fence;
+
+	xe_exec_queue_last_fence_lockdep_assert(q, vm);
+	xe_assert(vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
+		  type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
+	xe_assert(vm->xe, q->flags & (EXEC_QUEUE_FLAG_VM |
+				      EXEC_QUEUE_FLAG_MIGRATE));
+
+	if (q->tlb_inval[type].last_fence &&
+	    test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
+		     &q->tlb_inval[type].last_fence->flags))
+		xe_exec_queue_tlb_inval_last_fence_put(q, vm, type);
+
+	fence = q->tlb_inval[type].last_fence ?: dma_fence_get_stub();
+	dma_fence_get(fence);
+	return fence;
+}
+
+/**
+ * xe_exec_queue_tlb_inval_last_fence_set() - Set last fence for TLB invalidation
+ * @q: The exec queue
+ * @vm: The VM the engine does a bind for
+ * @fence: The fence
+ * @type: Either primary or media GT
+ *
+ * Set the last fence for the TLB invalidation type on the queue. Takes a
+ * reference on the fence; xe_exec_queue_tlb_inval_last_fence_put() should be
+ * called when the queue is closed.
+ */
+void xe_exec_queue_tlb_inval_last_fence_set(struct xe_exec_queue *q,
+					    struct xe_vm *vm,
+					    struct dma_fence *fence,
+					    unsigned int type)
+{
+	xe_exec_queue_last_fence_lockdep_assert(q, vm);
+	xe_assert(vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
+		  type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
+	xe_assert(vm->xe, q->flags & (EXEC_QUEUE_FLAG_VM |
+				      EXEC_QUEUE_FLAG_MIGRATE));
+	xe_assert(vm->xe, !dma_fence_is_container(fence));
+
+	xe_exec_queue_tlb_inval_last_fence_put(q, vm, type);
+	q->tlb_inval[type].last_fence = dma_fence_get(fence);
+}
+
 /**
  * xe_exec_queue_contexts_hwsp_rebase - Re-compute GGTT references
  * within all LRCs of a queue.
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
index a4dfbe858bda..1ba7354b33d1 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
@@ -14,6 +14,10 @@ struct drm_file;
 struct xe_device;
 struct xe_file;
 
+#define for_each_tlb_inval(__i)	\
+	for (__i = XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT; \
+	     __i <= XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT; ++__i)
+
 struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm,
 					   u32 logical_mask, u16 width,
 					   struct xe_hw_engine *hw_engine, u32 flags,
@@ -86,6 +90,23 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
 				  struct dma_fence *fence);
 int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
 				      struct xe_vm *vm);
+
+void xe_exec_queue_tlb_inval_last_fence_put(struct xe_exec_queue *q,
+					    struct xe_vm *vm,
+					    unsigned int type);
+
+void xe_exec_queue_tlb_inval_last_fence_put_unlocked(struct xe_exec_queue *q,
+						     unsigned int type);
+
+struct dma_fence *xe_exec_queue_tlb_inval_last_fence_get(struct xe_exec_queue *q,
+							 struct xe_vm *vm,
+							 unsigned int type);
+
+void xe_exec_queue_tlb_inval_last_fence_set(struct xe_exec_queue *q,
+					    struct xe_vm *vm,
+					    struct dma_fence *fence,
+					    unsigned int type);
+
 void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
 
 int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch);
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 838266c3914b..4ef61e803676 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -146,6 +146,11 @@ struct xe_exec_queue {
 		 * dependency scheduler
 		 */
 		struct xe_dep_scheduler *dep_scheduler;
+		/**
+		 * @last_fence: last fence for tlb invalidation, protected by
+		 * vm->lock in write mode
+		 */
+		struct dma_fence *last_fence;
 	} tlb_inval[XE_EXEC_QUEUE_TLB_INVAL_COUNT];
 
 	/** @pxp: PXP info tracking */
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index 921c9c1ea41f..4567bc88a8ec 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -2333,6 +2333,20 @@ void xe_migrate_job_unlock(struct xe_migrate *m, struct xe_exec_queue *q)
 		xe_vm_assert_held(q->vm);	/* User queues VM's should be locked */
 }
 
+#if IS_ENABLED(CONFIG_PROVE_LOCKING)
+/**
+ * xe_migrate_job_lock_assert() - Assert the queue's migrate job lock is held
+ * @q: Migrate queue
+ */
+void xe_migrate_job_lock_assert(struct xe_exec_queue *q)
+{
+	struct xe_migrate *m = gt_to_tile(q->gt)->migrate;
+
+	xe_gt_assert(q->gt, q == m->q);
+	lockdep_assert_held(&m->job_mutex);
+}
+#endif
+
 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
 #include "tests/xe_migrate.c"
 #endif
diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
index 4fad324b6253..9b5791617f5e 100644
--- a/drivers/gpu/drm/xe/xe_migrate.h
+++ b/drivers/gpu/drm/xe/xe_migrate.h
@@ -152,6 +152,14 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
 
 void xe_migrate_wait(struct xe_migrate *m);
 
+#if IS_ENABLED(CONFIG_PROVE_LOCKING)
+void xe_migrate_job_lock_assert(struct xe_exec_queue *q);
+#else
+static inline void xe_migrate_job_lock_assert(struct xe_exec_queue *q)
+{
+}
+#endif
+
 void xe_migrate_job_lock(struct xe_migrate *m, struct xe_exec_queue *q);
 void xe_migrate_job_unlock(struct xe_migrate *m, struct xe_exec_queue *q);
 
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 4058c476aa2d..4241cc433dca 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -1731,8 +1731,13 @@ void xe_vm_close_and_put(struct xe_vm *vm)
 
 	down_write(&vm->lock);
 	for_each_tile(tile, xe, id) {
-		if (vm->q[id])
+		if (vm->q[id]) {
+			int i;
+
 			xe_exec_queue_last_fence_put(vm->q[id], vm);
+			for_each_tlb_inval(i)
+				xe_exec_queue_tlb_inval_last_fence_put(vm->q[id], vm, i);
+		}
 	}
 	up_write(&vm->lock);
 
-- 
2.34.1



* [PATCH v5 3/6] drm/xe: Decouple bind queue last fence from TLB invalidations
  2025-10-29 20:57 [PATCH v5 0/6] Fix serialization on burst of unbinds - v2 Matthew Brost
  2025-10-29 20:57 ` [PATCH v5 1/6] drm/xe: Enforce correct user fence signaling order using drm_syncobjs Matthew Brost
  2025-10-29 20:57 ` [PATCH v5 2/6] drm/xe: Attach last fence to TLB invalidation job queues Matthew Brost
@ 2025-10-29 20:57 ` Matthew Brost
  2025-10-30  9:52   ` Thomas Hellström
  2025-10-29 20:57 ` [PATCH v5 4/6] drm/xe: Skip TLB invalidation waits in page fault binds Matthew Brost
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 16+ messages in thread
From: Matthew Brost @ 2025-10-29 20:57 UTC (permalink / raw)
  To: intel-xe; +Cc: thomas.hellstrom

Separate the bind queue’s last fence to apply exclusively to the bind
job, avoiding unnecessary serialization on prior TLB invalidations.
Preserve correct user fence signaling by merging bind and TLB
invalidation fences later in the pipeline.

Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
Signed-off-by: Matthew Brost <matthew.brost@intel.com>

---
v3:
 - Fix lockdep assert for migrate queues (CI)
 - Use individual dma fence contexts for array out fences (Testing)
 - Don't set last fence with arrays (Testing)
 - Move TLB invalid last fence under migrate lock (Testing)
 - Don't set queue last for migrate queues (Testing)
---
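The merge happens where fences are collected for signaling, roughly as
below (condensed from the ops_execute() changes in this patch; error
handling trimmed):

	fences[current_fence++] = fence ?: dma_fence_get_stub();

	xe_migrate_job_lock(tile->migrate, q);
	for_each_tlb_inval(i)
		fences[current_fence++] =
			xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
	xe_migrate_job_unlock(tile->migrate, q);

	/* User fences signal only after both the binds and their TLB
	 * invalidations have completed. */
	dma_fence_array_init(cf, n_fence, fences,
			     dma_fence_context_alloc(1), 1, false);
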
 drivers/gpu/drm/xe/xe_pt.c            | 73 ++++++++++---------------
 drivers/gpu/drm/xe/xe_sync.c          | 63 +++++++++++++++++-----
 drivers/gpu/drm/xe/xe_tlb_inval_job.c | 31 ++++++++---
 drivers/gpu/drm/xe/xe_tlb_inval_job.h |  5 +-
 drivers/gpu/drm/xe/xe_vm.c            | 76 ++++++++++++++-------------
 drivers/gpu/drm/xe/xe_vm_types.h      |  5 --
 6 files changed, 143 insertions(+), 110 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index d22fd1ccc0ba..a4b9cdf016d9 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -3,8 +3,6 @@
  * Copyright © 2022 Intel Corporation
  */
 
-#include <linux/dma-fence-array.h>
-
 #include "xe_pt.h"
 
 #include "regs/xe_gtt_defs.h"
@@ -2359,10 +2357,9 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 	struct xe_vm *vm = vops->vm;
 	struct xe_vm_pgtable_update_ops *pt_update_ops =
 		&vops->pt_update_ops[tile->id];
-	struct dma_fence *fence, *ifence, *mfence;
+	struct xe_exec_queue *q = pt_update_ops->q;
+	struct dma_fence *fence, *ifence = NULL, *mfence = NULL;
 	struct xe_tlb_inval_job *ijob = NULL, *mjob = NULL;
-	struct dma_fence **fences = NULL;
-	struct dma_fence_array *cf = NULL;
 	struct xe_range_fence *rfence;
 	struct xe_vma_op *op;
 	int err = 0, i;
@@ -2390,15 +2387,14 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 #endif
 
 	if (pt_update_ops->needs_invalidation) {
-		struct xe_exec_queue *q = pt_update_ops->q;
 		struct xe_dep_scheduler *dep_scheduler =
 			to_dep_scheduler(q, tile->primary_gt);
 
 		ijob = xe_tlb_inval_job_create(q, &tile->primary_gt->tlb_inval,
-					       dep_scheduler,
+					       dep_scheduler, vm,
 					       pt_update_ops->start,
 					       pt_update_ops->last,
-					       vm->usm.asid);
+					       XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
 		if (IS_ERR(ijob)) {
 			err = PTR_ERR(ijob);
 			goto kill_vm_tile1;
@@ -2410,26 +2406,15 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 
 			mjob = xe_tlb_inval_job_create(q,
 						       &tile->media_gt->tlb_inval,
-						       dep_scheduler,
+						       dep_scheduler, vm,
 						       pt_update_ops->start,
 						       pt_update_ops->last,
-						       vm->usm.asid);
+						       XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT);
 			if (IS_ERR(mjob)) {
 				err = PTR_ERR(mjob);
 				goto free_ijob;
 			}
 			update.mjob = mjob;
-
-			fences = kmalloc_array(2, sizeof(*fences), GFP_KERNEL);
-			if (!fences) {
-				err = -ENOMEM;
-				goto free_ijob;
-			}
-			cf = dma_fence_array_alloc(2);
-			if (!cf) {
-				err = -ENOMEM;
-				goto free_ijob;
-			}
 		}
 	}
 
@@ -2460,31 +2445,12 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 				  pt_update_ops->last, fence))
 		dma_fence_wait(fence, false);
 
-	/* tlb invalidation must be done before signaling unbind/rebind */
-	if (ijob) {
-		struct dma_fence *__fence;
-
+	if (ijob)
 		ifence = xe_tlb_inval_job_push(ijob, tile->migrate, fence);
-		__fence = ifence;
+	if (mjob)
+		mfence = xe_tlb_inval_job_push(mjob, tile->migrate, fence);
 
-		if (mjob) {
-			fences[0] = ifence;
-			mfence = xe_tlb_inval_job_push(mjob, tile->migrate,
-						       fence);
-			fences[1] = mfence;
-
-			dma_fence_array_init(cf, 2, fences,
-					     vm->composite_fence_ctx,
-					     vm->composite_fence_seqno++,
-					     false);
-			__fence = &cf->base;
-		}
-
-		dma_fence_put(fence);
-		fence = __fence;
-	}
-
-	if (!mjob) {
+	if (!mjob && !ijob) {
 		dma_resv_add_fence(xe_vm_resv(vm), fence,
 				   pt_update_ops->wait_vm_bookkeep ?
 				   DMA_RESV_USAGE_KERNEL :
@@ -2492,6 +2458,14 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 
 		list_for_each_entry(op, &vops->list, link)
 			op_commit(vops->vm, tile, pt_update_ops, op, fence, NULL);
+	} else if (ijob && !mjob) {
+		dma_resv_add_fence(xe_vm_resv(vm), ifence,
+				   pt_update_ops->wait_vm_bookkeep ?
+				   DMA_RESV_USAGE_KERNEL :
+				   DMA_RESV_USAGE_BOOKKEEP);
+
+		list_for_each_entry(op, &vops->list, link)
+			op_commit(vops->vm, tile, pt_update_ops, op, ifence, NULL);
 	} else {
 		dma_resv_add_fence(xe_vm_resv(vm), ifence,
 				   pt_update_ops->wait_vm_bookkeep ?
@@ -2511,16 +2485,23 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 	if (pt_update_ops->needs_svm_lock)
 		xe_svm_notifier_unlock(vm);
 
+	/*
+	 * The last fence is only used for zero bind queue idling; migrate
+	 * queues are not exposed to user space.
+	 */
+	if (!(q->flags & EXEC_QUEUE_FLAG_MIGRATE))
+		xe_exec_queue_last_fence_set(q, vm, fence);
+
 	xe_tlb_inval_job_put(mjob);
 	xe_tlb_inval_job_put(ijob);
+	dma_fence_put(ifence);
+	dma_fence_put(mfence);
 
 	return fence;
 
 free_rfence:
 	kfree(rfence);
 free_ijob:
-	kfree(cf);
-	kfree(fences);
 	xe_tlb_inval_job_put(mjob);
 	xe_tlb_inval_job_put(ijob);
 kill_vm_tile1:
diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index d48ab7b32ca5..df7ca349398b 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -14,7 +14,7 @@
 #include <drm/drm_syncobj.h>
 #include <uapi/drm/xe_drm.h>
 
-#include "xe_device_types.h"
+#include "xe_device.h"
 #include "xe_exec_queue.h"
 #include "xe_macros.h"
 #include "xe_sched_job_types.h"
@@ -297,26 +297,67 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
 	struct dma_fence **fences = NULL;
 	struct dma_fence_array *cf = NULL;
 	struct dma_fence *fence;
-	int i, num_in_fence = 0, current_fence = 0;
+	int i, num_fence = 0, current_fence = 0;
 
 	lockdep_assert_held(&vm->lock);
 
 	/* Count in-fences */
 	for (i = 0; i < num_sync; ++i) {
 		if (sync[i].fence) {
-			++num_in_fence;
+			++num_fence;
 			fence = sync[i].fence;
 		}
 	}
 
 	/* Easy case... */
-	if (!num_in_fence) {
+	if (!num_fence) {
+		if (q->flags & EXEC_QUEUE_FLAG_VM) {
+			struct xe_exec_queue *__q;
+			struct xe_tile *tile;
+			u8 id;
+
+			for_each_tile(tile, vm->xe, id)
+				num_fence += (1 + XE_MAX_GT_PER_TILE);
+
+			fences = kmalloc_array(num_fence, sizeof(*fences),
+					       GFP_KERNEL);
+			if (!fences)
+				return ERR_PTR(-ENOMEM);
+
+			fences[current_fence++] =
+				xe_exec_queue_last_fence_get(q, vm);
+			for_each_tlb_inval(i)
+				fences[current_fence++] =
+					xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
+			list_for_each_entry(__q, &q->multi_gt_list,
+					    multi_gt_link) {
+				fences[current_fence++] =
+					xe_exec_queue_last_fence_get(__q, vm);
+				for_each_tlb_inval(i)
+					fences[current_fence++] =
+						xe_exec_queue_tlb_inval_last_fence_get(__q, vm, i);
+			}
+
+			xe_assert(vm->xe, current_fence == num_fence);
+			cf = dma_fence_array_create(num_fence, fences,
+						    dma_fence_context_alloc(1),
+						    1, false);
+			if (!cf)
+				goto err_out;
+
+			return &cf->base;
+		}
+
 		fence = xe_exec_queue_last_fence_get(q, vm);
 		return fence;
 	}
 
-	/* Create composite fence */
-	fences = kmalloc_array(num_in_fence + 1, sizeof(*fences), GFP_KERNEL);
+	/*
+	 * Create composite fence - FIXME - the below code doesn't work. This is
+	 * unused in Mesa so we are ok for the moment. Perhaps we just disable
+	 * this entire code path if number of in fences != 0.
+	 */
+	fences = kmalloc_array(num_fence + 1, sizeof(*fences), GFP_KERNEL);
 	if (!fences)
 		return ERR_PTR(-ENOMEM);
 	for (i = 0; i < num_sync; ++i) {
@@ -326,14 +367,10 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
 		}
 	}
 	fences[current_fence++] = xe_exec_queue_last_fence_get(q, vm);
-	cf = dma_fence_array_create(num_in_fence, fences,
-				    vm->composite_fence_ctx,
-				    vm->composite_fence_seqno++,
-				    false);
-	if (!cf) {
-		--vm->composite_fence_seqno;
+	cf = dma_fence_array_create(num_fence, fences,
+				    dma_fence_context_alloc(1), 1, false);
+	if (!cf)
 		goto err_out;
-	}
 
 	return &cf->base;
 
diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
index 492def04a559..1ae0dec2cf31 100644
--- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
+++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
@@ -12,6 +12,7 @@
 #include "xe_tlb_inval_job.h"
 #include "xe_migrate.h"
 #include "xe_pm.h"
+#include "xe_vm.h"
 
 /** struct xe_tlb_inval_job - TLB invalidation job */
 struct xe_tlb_inval_job {
@@ -21,6 +22,8 @@ struct xe_tlb_inval_job {
 	struct xe_tlb_inval *tlb_inval;
 	/** @q: exec queue issuing the invalidate */
 	struct xe_exec_queue *q;
+	/** @vm: VM which TLB invalidation is being issued for */
+	struct xe_vm *vm;
 	/** @refcount: ref count of this job */
 	struct kref refcount;
 	/**
@@ -32,8 +35,8 @@ struct xe_tlb_inval_job {
 	u64 start;
 	/** @end: End address to invalidate */
 	u64 end;
-	/** @asid: Address space ID to invalidate */
-	u32 asid;
+	/** @type: GT type */
+	int type;
 	/** @fence_armed: Fence has been armed */
 	bool fence_armed;
 };
@@ -46,7 +49,7 @@ static struct dma_fence *xe_tlb_inval_job_run(struct xe_dep_job *dep_job)
 		container_of(job->fence, typeof(*ifence), base);
 
 	xe_tlb_inval_range(job->tlb_inval, ifence, job->start,
-			   job->end, job->asid);
+			   job->end, job->vm->usm.asid);
 
 	return job->fence;
 }
@@ -70,9 +73,10 @@ static const struct xe_dep_job_ops dep_job_ops = {
  * @q: exec queue issuing the invalidate
  * @tlb_inval: TLB invalidation client
  * @dep_scheduler: Dependency scheduler for job
+ * @vm: VM which TLB invalidation is being issued for
  * @start: Start address to invalidate
  * @end: End address to invalidate
- * @asid: Address space ID to invalidate
+ * @type: GT type
  *
  * Create a TLB invalidation job and initialize internal fields. The caller is
  * responsible for releasing the creation reference.
@@ -81,8 +85,8 @@ static const struct xe_dep_job_ops dep_job_ops = {
  */
 struct xe_tlb_inval_job *
 xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
-			struct xe_dep_scheduler *dep_scheduler, u64 start,
-			u64 end, u32 asid)
+			struct xe_dep_scheduler *dep_scheduler,
+			struct xe_vm *vm, u64 start, u64 end, int type)
 {
 	struct xe_tlb_inval_job *job;
 	struct drm_sched_entity *entity =
@@ -90,19 +94,24 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
 	struct xe_tlb_inval_fence *ifence;
 	int err;
 
+	xe_assert(vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
+		  type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
+
 	job = kmalloc(sizeof(*job), GFP_KERNEL);
 	if (!job)
 		return ERR_PTR(-ENOMEM);
 
 	job->q = q;
+	job->vm = vm;
 	job->tlb_inval = tlb_inval;
 	job->start = start;
 	job->end = end;
-	job->asid = asid;
 	job->fence_armed = false;
 	job->dep.ops = &dep_job_ops;
+	job->type = type;
 	kref_init(&job->refcount);
 	xe_exec_queue_get(q);	/* Pairs with put in xe_tlb_inval_job_destroy */
+	xe_vm_get(vm);		/* Pairs with put in xe_tlb_inval_job_destroy */
 
 	ifence = kmalloc(sizeof(*ifence), GFP_KERNEL);
 	if (!ifence) {
@@ -124,6 +133,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
 err_fence:
 	kfree(ifence);
 err_job:
+	xe_vm_put(vm);
 	xe_exec_queue_put(q);
 	kfree(job);
 
@@ -138,6 +148,7 @@ static void xe_tlb_inval_job_destroy(struct kref *ref)
 		container_of(job->fence, typeof(*ifence), base);
 	struct xe_exec_queue *q = job->q;
 	struct xe_device *xe = gt_to_xe(q->gt);
+	struct xe_vm *vm = job->vm;
 
 	if (!job->fence_armed)
 		kfree(ifence);
@@ -147,6 +158,7 @@ static void xe_tlb_inval_job_destroy(struct kref *ref)
 
 	drm_sched_job_cleanup(&job->dep.drm);
 	kfree(job);
+	xe_vm_put(vm);		/* Pairs with get from xe_tlb_inval_job_create */
 	xe_exec_queue_put(q);	/* Pairs with get from xe_tlb_inval_job_create */
 	xe_pm_runtime_put(xe);	/* Pairs with get from xe_tlb_inval_job_create */
 }
@@ -231,6 +243,11 @@ struct dma_fence *xe_tlb_inval_job_push(struct xe_tlb_inval_job *job,
 	dma_fence_get(&job->dep.drm.s_fence->finished);
 	drm_sched_entity_push_job(&job->dep.drm);
 
+	/* Let the upper layers fish this out */
+	xe_exec_queue_tlb_inval_last_fence_set(job->q, job->vm,
+					       &job->dep.drm.s_fence->finished,
+					       job->type);
+
 	xe_migrate_job_unlock(m, job->q);
 
 	/*
diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
index e63edcb26b50..4d6df1a6c6ca 100644
--- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
+++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
@@ -11,14 +11,15 @@
 struct dma_fence;
 struct xe_dep_scheduler;
 struct xe_exec_queue;
+struct xe_migrate;
 struct xe_tlb_inval;
 struct xe_tlb_inval_job;
-struct xe_migrate;
+struct xe_vm;
 
 struct xe_tlb_inval_job *
 xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
 			struct xe_dep_scheduler *dep_scheduler,
-			u64 start, u64 end, u32 asid);
+			struct xe_vm *vm, u64 start, u64 end, int type);
 
 int xe_tlb_inval_job_alloc_dep(struct xe_tlb_inval_job *job);
 
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 4241cc433dca..7a6e254996fb 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -1623,9 +1623,6 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
 		}
 	}
 
-	if (number_tiles > 1)
-		vm->composite_fence_ctx = dma_fence_context_alloc(1);
-
 	if (xef && xe->info.has_asid) {
 		u32 asid;
 
@@ -3107,20 +3104,26 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
 	struct dma_fence *fence = NULL;
 	struct dma_fence **fences = NULL;
 	struct dma_fence_array *cf = NULL;
-	int number_tiles = 0, current_fence = 0, err;
+	int number_tiles = 0, current_fence = 0, n_fence = 0, err;
 	u8 id;
 
 	number_tiles = vm_ops_setup_tile_args(vm, vops);
 	if (number_tiles == 0)
 		return ERR_PTR(-ENODATA);
 
-	if (number_tiles > 1) {
-		fences = kmalloc_array(number_tiles, sizeof(*fences),
-				       GFP_KERNEL);
-		if (!fences) {
-			fence = ERR_PTR(-ENOMEM);
-			goto err_trace;
-		}
+	for_each_tile(tile, vm->xe, id)
+		n_fence += (1 + XE_MAX_GT_PER_TILE);
+
+	fences = kmalloc_array(n_fence, sizeof(*fences), GFP_KERNEL);
+	if (!fences) {
+		fence = ERR_PTR(-ENOMEM);
+		goto err_trace;
+	}
+
+	cf = dma_fence_array_alloc(n_fence);
+	if (!cf) {
+		fence = ERR_PTR(-ENOMEM);
+		goto err_out;
 	}
 
 	for_each_tile(tile, vm->xe, id) {
@@ -3137,29 +3140,30 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
 	trace_xe_vm_ops_execute(vops);
 
 	for_each_tile(tile, vm->xe, id) {
+		struct xe_exec_queue *q = vops->pt_update_ops[tile->id].q;
+		int i;
+
+		fence = NULL;
 		if (!vops->pt_update_ops[id].num_ops)
-			continue;
+			goto collect_fences;
 
 		fence = xe_pt_update_ops_run(tile, vops);
 		if (IS_ERR(fence))
 			goto err_out;
 
-		if (fences)
-			fences[current_fence++] = fence;
+collect_fences:
+		fences[current_fence++] = fence ?: dma_fence_get_stub();
+		xe_migrate_job_lock(tile->migrate, q);
+		for_each_tlb_inval(i)
+			fences[current_fence++] =
+				xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
+		xe_migrate_job_unlock(tile->migrate, q);
 	}
 
-	if (fences) {
-		cf = dma_fence_array_create(number_tiles, fences,
-					    vm->composite_fence_ctx,
-					    vm->composite_fence_seqno++,
-					    false);
-		if (!cf) {
-			--vm->composite_fence_seqno;
-			fence = ERR_PTR(-ENOMEM);
-			goto err_out;
-		}
-		fence = &cf->base;
-	}
+	xe_assert(vm->xe, current_fence == n_fence);
+	dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
+			     1, false);
+	fence = &cf->base;
 
 	for_each_tile(tile, vm->xe, id) {
 		if (!vops->pt_update_ops[id].num_ops)
@@ -3220,7 +3224,6 @@ static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
 				   struct dma_fence *fence)
 {
-	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, vops->q);
 	struct xe_user_fence *ufence;
 	struct xe_vma_op *op;
 	int i;
@@ -3241,7 +3244,6 @@ static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
 	if (fence) {
 		for (i = 0; i < vops->num_syncs; i++)
 			xe_sync_entry_signal(vops->syncs + i, fence);
-		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
 	}
 }
 
@@ -3435,19 +3437,19 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
 				       struct xe_sync_entry *syncs,
 				       int num_syncs)
 {
-	struct dma_fence *fence;
+	struct dma_fence *fence = NULL;
 	int i, err = 0;
 
-	fence = xe_sync_in_fence_get(syncs, num_syncs,
-				     to_wait_exec_queue(vm, q), vm);
-	if (IS_ERR(fence))
-		return PTR_ERR(fence);
+	if (num_syncs) {
+		fence = xe_sync_in_fence_get(syncs, num_syncs,
+					     to_wait_exec_queue(vm, q), vm);
+		if (IS_ERR(fence))
+			return PTR_ERR(fence);
 
-	for (i = 0; i < num_syncs; i++)
-		xe_sync_entry_signal(&syncs[i], fence);
+		for (i = 0; i < num_syncs; i++)
+			xe_sync_entry_signal(&syncs[i], fence);
+	}
 
-	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
-				     fence);
 	dma_fence_put(fence);
 
 	return err;
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index d6e2a0fdd4b3..542dbe2f9310 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -221,11 +221,6 @@ struct xe_vm {
 #define XE_VM_FLAG_GSC			BIT(8)
 	unsigned long flags;
 
-	/** @composite_fence_ctx: context composite fence */
-	u64 composite_fence_ctx;
-	/** @composite_fence_seqno: seqno for composite fence */
-	u32 composite_fence_seqno;
-
 	/**
 	 * @lock: outer most lock, protects objects of anything attached to this
 	 * VM
-- 
2.34.1



* [PATCH v5 4/6] drm/xe: Skip TLB invalidation waits in page fault binds
  2025-10-29 20:57 [PATCH v5 0/6] Fix serialization on burst of unbinds - v2 Matthew Brost
                   ` (2 preceding siblings ...)
  2025-10-29 20:57 ` [PATCH v5 3/6] drm/xe: Decouple bind queue last fence from TLB invalidations Matthew Brost
@ 2025-10-29 20:57 ` Matthew Brost
  2025-11-03 15:19   ` Thomas Hellström
  2025-10-29 20:57 ` [PATCH v5 5/6] drm/xe: Disallow input fences on zero batch execs and zero binds Matthew Brost
  2025-10-29 20:57 ` [PATCH v5 6/6] drm/xe: Remove last fence dependency check from binds Matthew Brost
  5 siblings, 1 reply; 16+ messages in thread
From: Matthew Brost @ 2025-10-29 20:57 UTC (permalink / raw)
  To: intel-xe; +Cc: thomas.hellstrom

Avoid waiting on unrelated TLB invalidations when servicing page fault
binds. Since the migrate queue is shared across processes, TLB
invalidations triggered by other processes may occur concurrently but
are not relevant to the current bind. Teach the bind pipeline to skip
waits on such invalidations to prevent unnecessary serialization.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
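The opt-out is a flag on the VMA ops, set by the page fault paths
before execution (condensed from the diff below):

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;

ops_execute() then collects one bind fence per tile and skips the
shared migrate queue's TLB invalidation fences entirely.
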
 drivers/gpu/drm/xe/xe_vm.c       | 14 ++++++++++++--
 drivers/gpu/drm/xe/xe_vm_types.h |  1 +
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 7a6e254996fb..6c77ff109fe4 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -755,6 +755,7 @@ struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_ma
 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
 
 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
+	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
 	for_each_tile(tile, vm->xe, id) {
 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
 		vops.pt_update_ops[tile->id].q =
@@ -845,6 +846,7 @@ struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
 	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
 
 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
+	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
 	for_each_tile(tile, vm->xe, id) {
 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
 		vops.pt_update_ops[tile->id].q =
@@ -3111,8 +3113,13 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
 	if (number_tiles == 0)
 		return ERR_PTR(-ENODATA);
 
-	for_each_tile(tile, vm->xe, id)
-		n_fence += (1 + XE_MAX_GT_PER_TILE);
+	if (vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT) {
+		for_each_tile(tile, vm->xe, id)
+			++n_fence;
+	} else {
+		for_each_tile(tile, vm->xe, id)
+			n_fence += (1 + XE_MAX_GT_PER_TILE);
+	}
 
 	fences = kmalloc_array(n_fence, sizeof(*fences), GFP_KERNEL);
 	if (!fences) {
@@ -3153,6 +3160,9 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
 
 collect_fences:
 		fences[current_fence++] = fence ?: dma_fence_get_stub();
+		if (vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT)
+			continue;
+
 		xe_migrate_job_lock(tile->migrate, q);
 		for_each_tlb_inval(i)
 			fences[current_fence++] =
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 542dbe2f9310..3766dc37b3ad 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -466,6 +466,7 @@ struct xe_vma_ops {
 #define XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH BIT(0)
 #define XE_VMA_OPS_FLAG_MADVISE          BIT(1)
 #define XE_VMA_OPS_ARRAY_OF_BINDS	 BIT(2)
+#define XE_VMA_OPS_FLAG_SKIP_TLB_WAIT	 BIT(3)
 	u32 flags;
 #ifdef TEST_VM_OPS_ERROR
 	/** @inject_error: inject error to test error handling */
-- 
2.34.1



* [PATCH v5 5/6] drm/xe: Disallow input fences on zero batch execs and zero binds
  2025-10-29 20:57 [PATCH v5 0/6] Fix serialization on burst of unbinds - v2 Matthew Brost
                   ` (3 preceding siblings ...)
  2025-10-29 20:57 ` [PATCH v5 4/6] drm/xe: Skip TLB invalidation waits in page fault binds Matthew Brost
@ 2025-10-29 20:57 ` Matthew Brost
  2025-11-03 15:21   ` Thomas Hellström
  2025-10-29 20:57 ` [PATCH v5 6/6] drm/xe: Remove last fence dependency check from binds Matthew Brost
  5 siblings, 1 reply; 16+ messages in thread
From: Matthew Brost @ 2025-10-29 20:57 UTC (permalink / raw)
  To: intel-xe; +Cc: thomas.hellstrom

Prevent input fences from being installed on zero-batch execs or zero
binds. These operations were originally added so that Mesa could idle
a queue via output fences. Input fence support was later added for
interface consistency, but it leads to incorrect behavior due to
chained composite fences, which are disallowed.

Avoid the complexity of fixing this by removing support, as input fences
for these cases are not used in practice.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
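The effect is that these paths now reject any input fence up front
(condensed from the diff below):

	/* Reject in fences */
	for (i = 0; i < num_sync; ++i)
		if (sync[i].fence)
			return ERR_PTR(-EOPNOTSUPP);
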
 drivers/gpu/drm/xe/xe_sync.c | 101 +++++++++++++----------------------
 1 file changed, 36 insertions(+), 65 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index df7ca349398b..ff74528ca0c6 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -301,84 +301,55 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
 
 	lockdep_assert_held(&vm->lock);
 
-	/* Count in-fences */
-	for (i = 0; i < num_sync; ++i) {
-		if (sync[i].fence) {
-			++num_fence;
-			fence = sync[i].fence;
-		}
-	}
-
-	/* Easy case... */
-	if (!num_fence) {
-		if (q->flags & EXEC_QUEUE_FLAG_VM) {
-			struct xe_exec_queue *__q;
-			struct xe_tile *tile;
-			u8 id;
-
-			for_each_tile(tile, vm->xe, id)
-				num_fence += (1 + XE_MAX_GT_PER_TILE);
-
-			fences = kmalloc_array(num_fence, sizeof(*fences),
-					       GFP_KERNEL);
-			if (!fences)
-				return ERR_PTR(-ENOMEM);
-
+	/* Reject in fences */
+	for (i = 0; i < num_sync; ++i)
+		if (sync[i].fence)
+			return ERR_PTR(-EOPNOTSUPP);
+
+	if (q->flags & EXEC_QUEUE_FLAG_VM) {
+		struct xe_exec_queue *__q;
+		struct xe_tile *tile;
+		u8 id;
+
+		for_each_tile(tile, vm->xe, id)
+			num_fence += (1 + XE_MAX_GT_PER_TILE);
+
+		fences = kmalloc_array(num_fence, sizeof(*fences),
+				       GFP_KERNEL);
+		if (!fences)
+			return ERR_PTR(-ENOMEM);
+
+		fences[current_fence++] =
+			xe_exec_queue_last_fence_get(q, vm);
+		for_each_tlb_inval(i)
+			fences[current_fence++] =
+				xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
+		list_for_each_entry(__q, &q->multi_gt_list,
+				    multi_gt_link) {
 			fences[current_fence++] =
-				xe_exec_queue_last_fence_get(q, vm);
+				xe_exec_queue_last_fence_get(__q, vm);
 			for_each_tlb_inval(i)
 				fences[current_fence++] =
-					xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
-			list_for_each_entry(__q, &q->multi_gt_list,
-					    multi_gt_link) {
-				fences[current_fence++] =
-					xe_exec_queue_last_fence_get(__q, vm);
-				for_each_tlb_inval(i)
-					fences[current_fence++] =
-						xe_exec_queue_tlb_inval_last_fence_get(__q, vm, i);
-			}
-
-			xe_assert(vm->xe, current_fence == num_fence);
-			cf = dma_fence_array_create(num_fence, fences,
-						    dma_fence_context_alloc(1),
-						    1, false);
-			if (!cf)
-				goto err_out;
-
-			return &cf->base;
+					xe_exec_queue_tlb_inval_last_fence_get(__q, vm, i);
 		}
 
-		fence = xe_exec_queue_last_fence_get(q, vm);
-		return fence;
-	}
+		xe_assert(vm->xe, current_fence == num_fence);
+		cf = dma_fence_array_create(num_fence, fences,
+					    dma_fence_context_alloc(1),
+					    1, false);
+		if (!cf)
+			goto err_out;
 
-	/*
-	 * Create composite fence - FIXME - the below code doesn't work. This is
-	 * unused in Mesa so we are ok for the moment. Perhaps we just disable
-	 * this entire code path if number of in fences != 0.
-	 */
-	fences = kmalloc_array(num_fence + 1, sizeof(*fences), GFP_KERNEL);
-	if (!fences)
-		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < num_sync; ++i) {
-		if (sync[i].fence) {
-			dma_fence_get(sync[i].fence);
-			fences[current_fence++] = sync[i].fence;
-		}
+		return &cf->base;
 	}
-	fences[current_fence++] = xe_exec_queue_last_fence_get(q, vm);
-	cf = dma_fence_array_create(num_fence, fences,
-				    dma_fence_context_alloc(1), 1, false);
-	if (!cf)
-		goto err_out;
 
-	return &cf->base;
+	fence = xe_exec_queue_last_fence_get(q, vm);
+	return fence;
 
 err_out:
 	while (current_fence)
 		dma_fence_put(fences[--current_fence]);
 	kfree(fences);
-	kfree(cf);
 
 	return ERR_PTR(-ENOMEM);
 }
-- 
2.34.1



* [PATCH v5 6/6] drm/xe: Remove last fence dependency check from binds
  2025-10-29 20:57 [PATCH v5 0/6] Fix serialization on burst of unbinds - v2 Matthew Brost
                   ` (4 preceding siblings ...)
  2025-10-29 20:57 ` [PATCH v5 5/6] drm/xe: Disallow input fences on zero batch execs and zero binds Matthew Brost
@ 2025-10-29 20:57 ` Matthew Brost
  2025-10-30  8:43   ` Thomas Hellström
  2025-11-03 15:24   ` Thomas Hellström
  5 siblings, 2 replies; 16+ messages in thread
From: Matthew Brost @ 2025-10-29 20:57 UTC (permalink / raw)
  To: intel-xe; +Cc: thomas.hellstrom

Eliminate redundant last fence dependency checks in bind jobs, as they
are now equivalent to xe_exec_queue_is_idle. Simplify the code by
removing this dead logic.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_exec_queue.c | 23 -----------------------
 drivers/gpu/drm/xe/xe_exec_queue.h |  2 --
 drivers/gpu/drm/xe/xe_pt.c         |  7 -------
 3 files changed, 32 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index b7551592fe3f..3486744a8dfd 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -1122,29 +1122,6 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm,
 	q->last_fence = dma_fence_get(fence);
 }
 
-/**
- * xe_exec_queue_last_fence_test_dep - Test last fence dependency of queue
- * @q: The exec queue
- * @vm: The VM the engine does a bind or exec for
- *
- * Returns:
- * -ETIME if there exists an unsignalled last fence dependency, zero otherwise.
- */
-int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
-{
-	struct dma_fence *fence;
-	int err = 0;
-
-	fence = xe_exec_queue_last_fence_get(q, vm);
-	if (fence) {
-		err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) ?
-			0 : -ETIME;
-		dma_fence_put(fence);
-	}
-
-	return err;
-}
-
 /**
  * xe_exec_queue_tlb_inval_last_fence_put() - Drop ref to last TLB invalidation fence
  * @q: The exec queue
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
index 1ba7354b33d1..fda4d4f9bda8 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
@@ -88,8 +88,6 @@ struct dma_fence *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
 							  struct xe_vm *vm);
 void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
 				  struct dma_fence *fence);
-int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
-				      struct xe_vm *vm);
 
 void xe_exec_queue_tlb_inval_last_fence_put(struct xe_exec_queue *q,
 					    struct xe_vm *vm,
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index a4b9cdf016d9..01056b51ac9f 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -1338,13 +1338,6 @@ static int xe_pt_vm_dependencies(struct xe_sched_job *job,
 			return err;
 	}
 
-	if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
-		if (job)
-			err = xe_sched_job_last_fence_add_dep(job, vm);
-		else
-			err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
-	}
-
 	for (i = 0; job && !err && i < vops->num_syncs; i++)
 		err = xe_sync_entry_add_deps(&vops->syncs[i], job);
 
-- 
2.34.1



* Re: [PATCH v5 1/6] drm/xe: Enforce correct user fence signaling order using drm_syncobjs
  2025-10-29 20:57 ` [PATCH v5 1/6] drm/xe: Enforce correct user fence signaling order using drm_syncobjs Matthew Brost
@ 2025-10-30  7:58   ` Thomas Hellström
  2025-10-30 12:54     ` Matthew Brost
  0 siblings, 1 reply; 16+ messages in thread
From: Thomas Hellström @ 2025-10-30  7:58 UTC (permalink / raw)
  To: Matthew Brost, intel-xe

On Wed, 2025-10-29 at 13:57 -0700, Matthew Brost wrote:
> Prevent application hangs caused by out-of-order fence signaling when
> user fences are attached. Use drm_syncobj (via dma-fence-chain) to
> guarantee that each user fence signals in order, regardless of the
> signaling order of the attached fences. Ensure user fence writebacks
> to user space occur in the correct sequence.

Perhaps describe exactly what the signalling order is supposed to be?
Like calling order per exec_queue etc. (strictly I guess it's the
grabbing order of a certain lock, like the vm resv?)
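
A minimal sketch of the mechanism in question, assuming the per-queue
syncobj and timeline value added by this patch; "job_fence" here stands
in for whatever fence is being attached. dma-fence-chain guarantees
point N cannot signal before point N-1, regardless of the order in
which the job fences themselves complete:

	struct dma_fence_chain *chain = dma_fence_chain_alloc();
	struct dma_fence *ordered;

	if (!chain)
		return -ENOMEM;

	/* Publish the job fence as the next point on the timeline */
	drm_syncobj_add_point(q->ufence_syncobj, chain, job_fence,
			      ++q->ufence_timeline_value);

	/*
	 * This fence signals strictly in timeline order, so the user
	 * fence writeback is hooked to it, not to job_fence itself.
	 */
	ordered = drm_syncobj_fence_get(q->ufence_syncobj);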

Do we need a Fixes: for this?
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>

> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_exec.c             |  3 +-
>  drivers/gpu/drm/xe/xe_exec_queue.c       | 13 +++++++
>  drivers/gpu/drm/xe/xe_exec_queue_types.h |  7 ++++
>  drivers/gpu/drm/xe/xe_oa.c               | 45 ++++++++++++++++--------
>  drivers/gpu/drm/xe/xe_oa_types.h         |  8 +++++
>  drivers/gpu/drm/xe/xe_sync.c             | 17 +++++++--
>  drivers/gpu/drm/xe/xe_sync.h             |  3 ++
>  drivers/gpu/drm/xe/xe_sync_types.h       |  3 ++
>  drivers/gpu/drm/xe/xe_vm.c               |  4 +++
>  9 files changed, 85 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_exec.c
> b/drivers/gpu/drm/xe/xe_exec.c
> index 0dc27476832b..f7b9d8715053 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -166,7 +166,8 @@ int xe_exec_ioctl(struct drm_device *dev, void
> *data, struct drm_file *file)
>  
>  	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
>  		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
> -					  &syncs_user[num_syncs], SYNC_PARSE_FLAG_EXEC |
> +					  &syncs_user[num_syncs], NULL, 0,
> +					  SYNC_PARSE_FLAG_EXEC |
>  					  (xe_vm_in_lr_mode(vm) ?
>  					   SYNC_PARSE_FLAG_LR_MODE : 0));
>  		if (err)
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 90cbc95f8e2e..6e168efbac65 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -10,6 +10,7 @@
>  #include <drm/drm_device.h>
>  #include <drm/drm_drv.h>
>  #include <drm/drm_file.h>
> +#include <drm/drm_syncobj.h>
>  #include <uapi/drm/xe_drm.h>
>  
>  #include "xe_dep_scheduler.h"
> @@ -345,6 +346,7 @@ struct xe_exec_queue
> *xe_exec_queue_create_bind(struct xe_device *xe,
>  	struct xe_gt *gt = tile->primary_gt;
>  	struct xe_exec_queue *q;
>  	struct xe_vm *migrate_vm;
> +	int err;
>  
>  	migrate_vm = xe_migrate_get_vm(tile->migrate);
>  	if (xe->info.has_usm) {
> @@ -368,6 +370,14 @@ struct xe_exec_queue
> *xe_exec_queue_create_bind(struct xe_device *xe,
>  	}
>  	xe_vm_put(migrate_vm);
>  
> +	err = drm_syncobj_create(&q->ufence_syncobj,
> +				 DRM_SYNCOBJ_CREATE_SIGNALED,
> +				 NULL);
> +	if (err) {
> +		xe_exec_queue_put(q);
> +		return ERR_PTR(err);
> +	}
> +
>  	return q;
>  }
>  ALLOW_ERROR_INJECTION(xe_exec_queue_create_bind, ERRNO);
> @@ -377,6 +387,9 @@ void xe_exec_queue_destroy(struct kref *ref)
>  	struct xe_exec_queue *q = container_of(ref, struct
> xe_exec_queue, refcount);
>  	struct xe_exec_queue *eq, *next;
>  
> +	if (q->ufence_syncobj)
> +		drm_syncobj_put(q->ufence_syncobj);
> +
>  	if (xe_exec_queue_uses_pxp(q))
>  		xe_pxp_exec_queue_remove(gt_to_xe(q->gt)->pxp, q);
>  
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> index 282505fa1377..838266c3914b 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> @@ -15,6 +15,7 @@
>  #include "xe_hw_fence_types.h"
>  #include "xe_lrc_types.h"
>  
> +struct drm_syncobj;
>  struct xe_execlist_exec_queue;
>  struct xe_gt;
>  struct xe_guc_exec_queue;
> @@ -155,6 +156,12 @@ struct xe_exec_queue {
>  		struct list_head link;
>  	} pxp;
>  
> +	/** @ufence_syncobj: User fence syncobj */
> +	struct drm_syncobj *ufence_syncobj;
> +
> +	/** @ufence_timeline_value: User fence timeline value */
> +	u64 ufence_timeline_value;
> +
>  	/** @ops: submission backend exec queue operations */
>  	const struct xe_exec_queue_ops *ops;
>  
> diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
> index f901ba52b403..7a13a7bd99a6 100644
> --- a/drivers/gpu/drm/xe/xe_oa.c
> +++ b/drivers/gpu/drm/xe/xe_oa.c
> @@ -10,6 +10,7 @@
>  
>  #include <drm/drm_drv.h>
>  #include <drm/drm_managed.h>
> +#include <drm/drm_syncobj.h>
>  #include <uapi/drm/xe_drm.h>
>  
>  #include <generated/xe_wa_oob.h>
> @@ -1390,7 +1391,9 @@ static int xe_oa_user_extensions(struct xe_oa
> *oa, enum xe_oa_user_extn_from fro
>  	return 0;
>  }
>  
> -static int xe_oa_parse_syncs(struct xe_oa *oa, struct
> xe_oa_open_param *param)
> +static int xe_oa_parse_syncs(struct xe_oa *oa,
> +			     struct xe_oa_stream *stream,
> +			     struct xe_oa_open_param *param)
>  {
>  	int ret, num_syncs, num_ufence = 0;
>  
> @@ -1410,7 +1413,9 @@ static int xe_oa_parse_syncs(struct xe_oa *oa,
> struct xe_oa_open_param *param)
>  
>  	for (num_syncs = 0; num_syncs < param->num_syncs; num_syncs++) {
>  		ret = xe_sync_entry_parse(oa->xe, param->xef, &param->syncs[num_syncs],
> -					  &param->syncs_user[num_syncs], 0);
> +					  &param->syncs_user[num_syncs],
> +					  stream->ufence_syncobj,
> +					  ++stream->ufence_timeline_value, 0);
>  		if (ret)
>  			goto err_syncs;
>  
> @@ -1540,7 +1545,7 @@ static long xe_oa_config_locked(struct
> xe_oa_stream *stream, u64 arg)
>  		return -ENODEV;
>  
>  	param.xef = stream->xef;
> -	err = xe_oa_parse_syncs(stream->oa, &param);
> +	err = xe_oa_parse_syncs(stream->oa, stream, &param);
>  	if (err)
>  		goto err_config_put;
>  
> @@ -1636,6 +1641,7 @@ static void xe_oa_destroy_locked(struct
> xe_oa_stream *stream)
>  	if (stream->exec_q)
>  		xe_exec_queue_put(stream->exec_q);
>  
> +	drm_syncobj_put(stream->ufence_syncobj);
>  	kfree(stream);
>  }
>  
> @@ -1827,6 +1833,7 @@ static int
> xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
>  					  struct xe_oa_open_param
> *param)
>  {
>  	struct xe_oa_stream *stream;
> +	struct drm_syncobj *ufence_syncobj;
>  	int stream_fd;
>  	int ret;
>  
> @@ -1837,17 +1844,31 @@ static int
> xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
>  		goto exit;
>  	}
>  
> +	ret = drm_syncobj_create(&ufence_syncobj,
> DRM_SYNCOBJ_CREATE_SIGNALED,
> +				 NULL);
> +	if (ret)
> +		goto exit;
> +
>  	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
>  	if (!stream) {
>  		ret = -ENOMEM;
> -		goto exit;
> +		goto err_syncobj;
>  	}
> -
> +	stream->ufence_syncobj = ufence_syncobj;
>  	stream->oa = oa;
> -	ret = xe_oa_stream_init(stream, param);
> +
> +	ret = xe_oa_parse_syncs(oa, stream, param);
>  	if (ret)
>  		goto err_free;
>  
> +	ret = xe_oa_stream_init(stream, param);
> +	if (ret) {
> +		while (param->num_syncs--)
> +			xe_sync_entry_cleanup(&param->syncs[param->num_syncs]);
> +		kfree(param->syncs);
> +		goto err_free;
> +	}
> +
>  	if (!param->disabled) {
>  		ret = xe_oa_enable_locked(stream);
>  		if (ret)
> @@ -1871,6 +1892,8 @@ static int
> xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
>  	xe_oa_stream_destroy(stream);
>  err_free:
>  	kfree(stream);
> +err_syncobj:
> +	drm_syncobj_put(ufence_syncobj);
>  exit:
>  	return ret;
>  }
> @@ -2084,22 +2107,14 @@ int xe_oa_stream_open_ioctl(struct drm_device
> *dev, u64 data, struct drm_file *f
>  		goto err_exec_q;
>  	}
>  
> -	ret = xe_oa_parse_syncs(oa, &param);
> -	if (ret)
> -		goto err_exec_q;
> -
>  	mutex_lock(&param.hwe->gt->oa.gt_lock);
>  	ret = xe_oa_stream_open_ioctl_locked(oa, &param);
>  	mutex_unlock(&param.hwe->gt->oa.gt_lock);
>  	if (ret < 0)
> -		goto err_sync_cleanup;
> +		goto err_exec_q;
>  
>  	return ret;
>  
> -err_sync_cleanup:
> -	while (param.num_syncs--)
> -		xe_sync_entry_cleanup(&param.syncs[param.num_syncs]);
> -	kfree(param.syncs);
>  err_exec_q:
>  	if (param.exec_q)
>  		xe_exec_queue_put(param.exec_q);
> diff --git a/drivers/gpu/drm/xe/xe_oa_types.h
> b/drivers/gpu/drm/xe/xe_oa_types.h
> index 2628f78c4e8d..daf701b5d48b 100644
> --- a/drivers/gpu/drm/xe/xe_oa_types.h
> +++ b/drivers/gpu/drm/xe/xe_oa_types.h
> @@ -15,6 +15,8 @@
>  #include "regs/xe_reg_defs.h"
>  #include "xe_hw_engine_types.h"
>  
> +struct drm_syncobj;
> +
>  #define DEFAULT_XE_OA_BUFFER_SIZE SZ_16M
>  
>  enum xe_oa_report_header {
> @@ -248,6 +250,12 @@ struct xe_oa_stream {
>  	/** @xef: xe_file with which the stream was opened */
>  	struct xe_file *xef;
>  
> +	/** @ufence_syncobj: User fence syncobj */
> +	struct drm_syncobj *ufence_syncobj;
> +
> +	/** @ufence_timeline_value: User fence timeline value */
> +	u64 ufence_timeline_value;
> +
>  	/** @last_fence: fence to use in stream destroy when needed
> */
>  	struct dma_fence *last_fence;
>  
> diff --git a/drivers/gpu/drm/xe/xe_sync.c
> b/drivers/gpu/drm/xe/xe_sync.c
> index 82872a51f098..d48ab7b32ca5 100644
> --- a/drivers/gpu/drm/xe/xe_sync.c
> +++ b/drivers/gpu/drm/xe/xe_sync.c
> @@ -113,6 +113,8 @@ static void user_fence_cb(struct dma_fence
> *fence, struct dma_fence_cb *cb)
>  int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
>  			struct xe_sync_entry *sync,
>  			struct drm_xe_sync __user *sync_user,
> +			struct drm_syncobj *ufence_syncobj,
> +			u64 ufence_timeline_value,
>  			unsigned int flags)
>  {
>  	struct drm_xe_sync sync_in;
> @@ -192,10 +194,15 @@ int xe_sync_entry_parse(struct xe_device *xe,
> struct xe_file *xef,
>  		if (exec) {
>  			sync->addr = sync_in.addr;
>  		} else {
> +			sync->ufence_timeline_value = ufence_timeline_value;
> +			sync->ufence = user_fence_create(xe, sync_in.addr,
> +							 sync_in.timeline_value);
>  			if (XE_IOCTL_DBG(xe, IS_ERR(sync->ufence)))
>  				return PTR_ERR(sync->ufence);
> +			sync->ufence_chain_fence = dma_fence_chain_alloc();
> +			if (!sync->ufence_chain_fence)
> +				return -ENOMEM;
> +			sync->ufence_syncobj = ufence_syncobj;
>  		}
>  
>  		break;
> @@ -239,7 +246,12 @@ void xe_sync_entry_signal(struct xe_sync_entry
> *sync, struct dma_fence *fence)
>  	} else if (sync->ufence) {
>  		int err;
>  
> -		dma_fence_get(fence);
> +		drm_syncobj_add_point(sync->ufence_syncobj,
> +				      sync->ufence_chain_fence,
> +				      fence, sync->ufence_timeline_value);
> +		sync->ufence_chain_fence = NULL;
> +
> +		fence = drm_syncobj_fence_get(sync->ufence_syncobj);
>  		user_fence_get(sync->ufence);
>  		err = dma_fence_add_callback(fence, &sync->ufence->cb,
>  					     user_fence_cb);
> @@ -259,7 +271,8 @@ void xe_sync_entry_cleanup(struct xe_sync_entry
> *sync)
>  		drm_syncobj_put(sync->syncobj);
>  	dma_fence_put(sync->fence);
>  	dma_fence_chain_free(sync->chain_fence);
> -	if (sync->ufence)
> +	dma_fence_chain_free(sync->ufence_chain_fence);
> +	if (!IS_ERR_OR_NULL(sync->ufence))
>  		user_fence_put(sync->ufence);
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_sync.h
> b/drivers/gpu/drm/xe/xe_sync.h
> index 256ffc1e54dc..51f2d803e977 100644
> --- a/drivers/gpu/drm/xe/xe_sync.h
> +++ b/drivers/gpu/drm/xe/xe_sync.h
> @@ -8,6 +8,7 @@
>  
>  #include "xe_sync_types.h"
>  
> +struct drm_syncobj;
>  struct xe_device;
>  struct xe_exec_queue;
>  struct xe_file;
> @@ -21,6 +22,8 @@ struct xe_vm;
>  int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
>  			struct xe_sync_entry *sync,
>  			struct drm_xe_sync __user *sync_user,
> +			struct drm_syncobj *ufence_syncobj,
> +			u64 ufence_timeline_value,
>  			unsigned int flags);
>  int xe_sync_entry_add_deps(struct xe_sync_entry *sync,
>  			   struct xe_sched_job *job);
> diff --git a/drivers/gpu/drm/xe/xe_sync_types.h
> b/drivers/gpu/drm/xe/xe_sync_types.h
> index 30ac3f51993b..b88f1833e28c 100644
> --- a/drivers/gpu/drm/xe/xe_sync_types.h
> +++ b/drivers/gpu/drm/xe/xe_sync_types.h
> @@ -18,9 +18,12 @@ struct xe_sync_entry {
>  	struct drm_syncobj *syncobj;
>  	struct dma_fence *fence;
>  	struct dma_fence_chain *chain_fence;
> +	struct dma_fence_chain *ufence_chain_fence;
> +	struct drm_syncobj *ufence_syncobj;
>  	struct xe_user_fence *ufence;
>  	u64 addr;
>  	u64 timeline_value;
> +	u64 ufence_timeline_value;
>  	u32 type;
>  	u32 flags;
>  };
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 10d77666a425..4058c476aa2d 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -3633,8 +3633,12 @@ int xe_vm_bind_ioctl(struct drm_device *dev,
> void *data, struct drm_file *file)
>  
>  	syncs_user = u64_to_user_ptr(args->syncs);
>  	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
> +		struct xe_exec_queue *__q = q ?: vm->q[0];
> +
>  		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
>  					  &syncs_user[num_syncs],
> +					  __q->ufence_syncobj,
> +					  ++__q->ufence_timeline_value,
>  					  (xe_vm_in_lr_mode(vm) ?
>  					   SYNC_PARSE_FLAG_LR_MODE : 0) |
>  					  (!args->num_binds ?


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 2/6] drm/xe: Attach last fence to TLB invalidation job queues
  2025-10-29 20:57 ` [PATCH v5 2/6] drm/xe: Attach last fence to TLB invalidation job queues Matthew Brost
@ 2025-10-30  8:24   ` Thomas Hellström
  0 siblings, 0 replies; 16+ messages in thread
From: Thomas Hellström @ 2025-10-30  8:24 UTC (permalink / raw)
  To: Matthew Brost, intel-xe

On Wed, 2025-10-29 at 13:57 -0700, Matthew Brost wrote:
> Add support for attaching the last fence to TLB invalidation job
> queues
> to address serialization issues during bursts of unbind jobs. Ensure
> that user fence signaling for a bind job reflects both the bind job
> itself and the last fences of all related TLB invalidations. Maintain
> submission order based solely on the state of the bind and TLB
> invalidation queues.
> 
> Introduce support functions for last fence attachment to TLB
> invalidation queues.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
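
A short sketch of how the new helpers compose, under this patch's
assumptions (q is a bind queue, vm its VM; dropping the fence
references is elided):

	struct dma_fence *fences[1 + XE_EXEC_QUEUE_TLB_INVAL_COUNT];
	int i, n = 0;

	/* The bind queue's own last fence... */
	fences[n++] = xe_exec_queue_last_fence_get(q, vm);
	/* ...plus the last TLB invalidation fence per GT type */
	for_each_tlb_inval(i)
		fences[n++] = xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
	/* merge, e.g. into a dma_fence_array, before signaling ufences */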

> 
> ---
> v3:
>  - Fix assert in xe_exec_queue_tlb_inval_last_fence_set (CI)
>  - Ensure migrate lock held for migrate queues (Testing)
> v5:
>  - Style nits (Thomas)
>  - Rewrite commit message (Thomas)
> ---
>  drivers/gpu/drm/xe/xe_exec_queue.c       | 103 ++++++++++++++++++++-
>  drivers/gpu/drm/xe/xe_exec_queue.h       |  21 +++++
>  drivers/gpu/drm/xe/xe_exec_queue_types.h |   5 ++
>  drivers/gpu/drm/xe/xe_migrate.c          |  14 +++
>  drivers/gpu/drm/xe/xe_migrate.h          |   8 ++
>  drivers/gpu/drm/xe/xe_vm.c               |   7 +-
>  6 files changed, 156 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 6e168efbac65..b7551592fe3f 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -386,6 +386,7 @@ void xe_exec_queue_destroy(struct kref *ref)
>  {
>  	struct xe_exec_queue *q = container_of(ref, struct
> xe_exec_queue, refcount);
>  	struct xe_exec_queue *eq, *next;
> +	int i;
>  
>  	if (q->ufence_syncobj)
>  		drm_syncobj_put(q->ufence_syncobj);
> @@ -394,6 +395,9 @@ void xe_exec_queue_destroy(struct kref *ref)
>  		xe_pxp_exec_queue_remove(gt_to_xe(q->gt)->pxp, q);
>  
>  	xe_exec_queue_last_fence_put_unlocked(q);
> +	for_each_tlb_inval(i)
> +		xe_exec_queue_tlb_inval_last_fence_put_unlocked(q,
> i);
> +
>  	if (!(q->flags & EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD)) {
>  		list_for_each_entry_safe(eq, next, &q->multi_gt_list,
>  					 multi_gt_link)
> @@ -1011,7 +1015,9 @@ int xe_exec_queue_destroy_ioctl(struct
> drm_device *dev, void *data,
>  static void xe_exec_queue_last_fence_lockdep_assert(struct
> xe_exec_queue *q,
>  						    struct xe_vm
> *vm)
>  {
> -	if (q->flags & EXEC_QUEUE_FLAG_VM) {
> +	if (q->flags & EXEC_QUEUE_FLAG_MIGRATE) {
> +		xe_migrate_job_lock_assert(q);
> +	} else if (q->flags & EXEC_QUEUE_FLAG_VM) {
>  		lockdep_assert_held(&vm->lock);
>  	} else {
>  		xe_vm_assert_held(vm);
> @@ -1110,6 +1116,7 @@ void xe_exec_queue_last_fence_set(struct
> xe_exec_queue *q, struct xe_vm *vm,
>  				  struct dma_fence *fence)
>  {
>  	xe_exec_queue_last_fence_lockdep_assert(q, vm);
> +	xe_assert(vm->xe, !dma_fence_is_container(fence));
>  
>  	xe_exec_queue_last_fence_put(q, vm);
>  	q->last_fence = dma_fence_get(fence);
> @@ -1138,6 +1145,100 @@ int xe_exec_queue_last_fence_test_dep(struct
> xe_exec_queue *q, struct xe_vm *vm)
>  	return err;
>  }
>  
> +/**
> + * xe_exec_queue_tlb_inval_last_fence_put() - Drop ref to last TLB
> invalidation fence
> + * @q: The exec queue
> + * @vm: The VM the engine does a bind for
> + * @type: Either primary or media GT
> + */
> +void xe_exec_queue_tlb_inval_last_fence_put(struct xe_exec_queue *q,
> +					    struct xe_vm *vm,
> +					    unsigned int type)
> +{
> +	xe_exec_queue_last_fence_lockdep_assert(q, vm);
> +	xe_assert(vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
> +		  type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
> +
> +	xe_exec_queue_tlb_inval_last_fence_put_unlocked(q, type);
> +}
> +
> +/**
> + * xe_exec_queue_tlb_inval_last_fence_put_unlocked() - Drop ref to
> last TLB
> + * invalidation fence unlocked
> + * @q: The exec queue
> + * @type: Either primary or media GT
> + *
> + * Only safe to be called from xe_exec_queue_destroy().
> + */
> +void xe_exec_queue_tlb_inval_last_fence_put_unlocked(struct
> xe_exec_queue *q,
> +						     unsigned int
> type)
> +{
> +	xe_assert(q->vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
> +		  type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
> +
> +	dma_fence_put(q->tlb_inval[type].last_fence);
> +	q->tlb_inval[type].last_fence = NULL;
> +}
> +
> +/**
> + * xe_exec_queue_tlb_inval_last_fence_get() - Get last fence for TLB
> invalidation
> + * @q: The exec queue
> + * @vm: The VM the engine does a bind for
> + * @type: Either primary or media GT
> + *
> + * Get last fence, takes a ref
> + *
> + * Returns: last fence if not signaled, dma fence stub if signaled
> + */
> +struct dma_fence *xe_exec_queue_tlb_inval_last_fence_get(struct
> xe_exec_queue *q,
> +							 struct
> xe_vm *vm,
> +							 unsigned
> int type)
> +{
> +	struct dma_fence *fence;
> +
> +	xe_exec_queue_last_fence_lockdep_assert(q, vm);
> +	xe_assert(vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
> +		  type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
> +	xe_assert(vm->xe, q->flags & (EXEC_QUEUE_FLAG_VM |
> +				      EXEC_QUEUE_FLAG_MIGRATE));
> +
> +	if (q->tlb_inval[type].last_fence &&
> +	    test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> +		     &q->tlb_inval[type].last_fence->flags))
> +		xe_exec_queue_tlb_inval_last_fence_put(q, vm, type);
> +
> +	fence = q->tlb_inval[type].last_fence ?:
> dma_fence_get_stub();
> +	dma_fence_get(fence);
> +	return fence;
> +}
> +
> +/**
> + * xe_exec_queue_tlb_inval_last_fence_set() - Set last fence for TLB
> invalidation
> + * @q: The exec queue
> + * @vm: The VM the engine does a bind for
> + * @fence: The fence
> + * @type: Either primary or media GT
> + *
> + * Set the last fence for the tlb invalidation type on the queue.
> Increases
> + * reference count for fence, when closing queue
> + * xe_exec_queue_tlb_inval_last_fence_put should be called.
> + */
> +void xe_exec_queue_tlb_inval_last_fence_set(struct xe_exec_queue *q,
> +					    struct xe_vm *vm,
> +					    struct dma_fence *fence,
> +					    unsigned int type)
> +{
> +	xe_exec_queue_last_fence_lockdep_assert(q, vm);
> +	xe_assert(vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
> +		  type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
> +	xe_assert(vm->xe, q->flags & (EXEC_QUEUE_FLAG_VM |
> +				      EXEC_QUEUE_FLAG_MIGRATE));
> +	xe_assert(vm->xe, !dma_fence_is_container(fence));
> +
> +	xe_exec_queue_tlb_inval_last_fence_put(q, vm, type);
> +	q->tlb_inval[type].last_fence = dma_fence_get(fence);
> +}
> +
>  /**
>   * xe_exec_queue_contexts_hwsp_rebase - Re-compute GGTT references
>   * within all LRCs of a queue.
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h
> b/drivers/gpu/drm/xe/xe_exec_queue.h
> index a4dfbe858bda..1ba7354b33d1 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> @@ -14,6 +14,10 @@ struct drm_file;
>  struct xe_device;
>  struct xe_file;
>  
> +#define for_each_tlb_inval(__i)	\
> +	for (__i = XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT; \
> +	     __i <= XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT; ++__i)
> +
>  struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe,
> struct xe_vm *vm,
>  					   u32 logical_mask, u16
> width,
>  					   struct xe_hw_engine
> *hw_engine, u32 flags,
> @@ -86,6 +90,23 @@ void xe_exec_queue_last_fence_set(struct
> xe_exec_queue *e, struct xe_vm *vm,
>  				  struct dma_fence *fence);
>  int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
>  				      struct xe_vm *vm);
> +
> +void xe_exec_queue_tlb_inval_last_fence_put(struct xe_exec_queue *q,
> +					    struct xe_vm *vm,
> +					    unsigned int type);
> +
> +void xe_exec_queue_tlb_inval_last_fence_put_unlocked(struct
> xe_exec_queue *q,
> +						     unsigned int
> type);
> +
> +struct dma_fence *xe_exec_queue_tlb_inval_last_fence_get(struct
> xe_exec_queue *q,
> +							 struct
> xe_vm *vm,
> +							 unsigned
> int type);
> +
> +void xe_exec_queue_tlb_inval_last_fence_set(struct xe_exec_queue *q,
> +					    struct xe_vm *vm,
> +					    struct dma_fence *fence,
> +					    unsigned int type);
> +
>  void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
>  
>  int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void
> *scratch);
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> index 838266c3914b..4ef61e803676 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> @@ -146,6 +146,11 @@ struct xe_exec_queue {
>  		 * dependency scheduler
>  		 */
>  		struct xe_dep_scheduler *dep_scheduler;
> +		/**
> +		 * @last_fence: last fence for tlb invalidation,
> protected by
> +		 * vm->lock in write mode
> +		 */
> +		struct dma_fence *last_fence;
>  	} tlb_inval[XE_EXEC_QUEUE_TLB_INVAL_COUNT];
>  
>  	/** @pxp: PXP info tracking */
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c
> b/drivers/gpu/drm/xe/xe_migrate.c
> index 921c9c1ea41f..4567bc88a8ec 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -2333,6 +2333,20 @@ void xe_migrate_job_unlock(struct xe_migrate
> *m, struct xe_exec_queue *q)
>  		xe_vm_assert_held(q->vm);	/* User queues VM's
> should be locked */
>  }
>  
> +#if IS_ENABLED(CONFIG_PROVE_LOCKING)
> +/**
> + * xe_migrate_job_lock_assert() - Assert migrate job lock held of
> queue
> + * @q: Migrate queue
> + */
> +void xe_migrate_job_lock_assert(struct xe_exec_queue *q)
> +{
> +	struct xe_migrate *m = gt_to_tile(q->gt)->migrate;
> +
> +	xe_gt_assert(q->gt, q == m->q);
> +	lockdep_assert_held(&m->job_mutex);
> +}
> +#endif
> +
>  #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
>  #include "tests/xe_migrate.c"
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_migrate.h
> b/drivers/gpu/drm/xe/xe_migrate.h
> index 4fad324b6253..9b5791617f5e 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.h
> +++ b/drivers/gpu/drm/xe/xe_migrate.h
> @@ -152,6 +152,14 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
>  
>  void xe_migrate_wait(struct xe_migrate *m);
>  
> +#if IS_ENABLED(CONFIG_PROVE_LOCKING)
> +void xe_migrate_job_lock_assert(struct xe_exec_queue *q);
> +#else
> +static inline void xe_migrate_job_lock_assert(struct xe_exec_queue
> *q)
> +{
> +}
> +#endif
> +
>  void xe_migrate_job_lock(struct xe_migrate *m, struct xe_exec_queue
> *q);
>  void xe_migrate_job_unlock(struct xe_migrate *m, struct
> xe_exec_queue *q);
>  
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 4058c476aa2d..4241cc433dca 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -1731,8 +1731,13 @@ void xe_vm_close_and_put(struct xe_vm *vm)
>  
>  	down_write(&vm->lock);
>  	for_each_tile(tile, xe, id) {
> -		if (vm->q[id])
> +		if (vm->q[id]) {
> +			int i;
> +
>  			xe_exec_queue_last_fence_put(vm->q[id], vm);
> +			for_each_tlb_inval(i)
> +				xe_exec_queue_tlb_inval_last_fence_put(vm->q[id], vm, i);
> +		}
>  	}
>  	up_write(&vm->lock);
>  


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 6/6] drm/xe: Remove last fence dependency check from binds
  2025-10-29 20:57 ` [PATCH v5 6/6] drm/xe: Remove last fence dependency check from binds Matthew Brost
@ 2025-10-30  8:43   ` Thomas Hellström
  2025-11-03 15:24   ` Thomas Hellström
  1 sibling, 0 replies; 16+ messages in thread
From: Thomas Hellström @ 2025-10-30  8:43 UTC (permalink / raw)
  To: Matthew Brost, intel-xe

On Wed, 2025-10-29 at 13:57 -0700, Matthew Brost wrote:
> Eliminate redundant last fence dependency checks in bind jobs, as they
> are now equivalent to an xe_exec_queue_is_idle() check. Simplify the
> code by removing this dead logic.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>

> ---
>  drivers/gpu/drm/xe/xe_exec_queue.c | 23 -----------------------
>  drivers/gpu/drm/xe/xe_exec_queue.h |  2 --
>  drivers/gpu/drm/xe/xe_pt.c         |  7 -------
>  3 files changed, 32 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> b/drivers/gpu/drm/xe/xe_exec_queue.c
> index b7551592fe3f..3486744a8dfd 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -1122,29 +1122,6 @@ void xe_exec_queue_last_fence_set(struct
> xe_exec_queue *q, struct xe_vm *vm,
>  	q->last_fence = dma_fence_get(fence);
>  }
>  
> -/**
> - * xe_exec_queue_last_fence_test_dep - Test last fence dependency of
> queue
> - * @q: The exec queue
> - * @vm: The VM the engine does a bind or exec for
> - *
> - * Returns:
> - * -ETIME if there exists an unsignalled last fence dependency, zero
> otherwise.
> - */
> -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> struct xe_vm *vm)
> -{
> -	struct dma_fence *fence;
> -	int err = 0;
> -
> -	fence = xe_exec_queue_last_fence_get(q, vm);
> -	if (fence) {
> -		err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) ?
> -			0 : -ETIME;
> -		dma_fence_put(fence);
> -	}
> -
> -	return err;
> -}
> -
>  /**
>   * xe_exec_queue_tlb_inval_last_fence_put() - Drop ref to last TLB
> invalidation fence
>   * @q: The exec queue
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h
> b/drivers/gpu/drm/xe/xe_exec_queue.h
> index 1ba7354b33d1..fda4d4f9bda8 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> @@ -88,8 +88,6 @@ struct dma_fence
> *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
>  							  struct
> xe_vm *vm);
>  void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct
> xe_vm *vm,
>  				  struct dma_fence *fence);
> -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> -				      struct xe_vm *vm);
>  
>  void xe_exec_queue_tlb_inval_last_fence_put(struct xe_exec_queue *q,
>  					    struct xe_vm *vm,
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index a4b9cdf016d9..01056b51ac9f 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -1338,13 +1338,6 @@ static int xe_pt_vm_dependencies(struct
> xe_sched_job *job,
>  			return err;
>  	}
>  
> -	if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
> -		if (job)
> -			err = xe_sched_job_last_fence_add_dep(job,
> vm);
> -		else
> -			err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> -	}
> -
>  	for (i = 0; job && !err && i < vops->num_syncs; i++)
>  		err = xe_sync_entry_add_deps(&vops->syncs[i], job);
>  


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 3/6] drm/xe: Decouple bind queue last fence from TLB invalidations
  2025-10-29 20:57 ` [PATCH v5 3/6] drm/xe: Decouple bind queue last fence from TLB invalidations Matthew Brost
@ 2025-10-30  9:52   ` Thomas Hellström
  0 siblings, 0 replies; 16+ messages in thread
From: Thomas Hellström @ 2025-10-30  9:52 UTC (permalink / raw)
  To: Matthew Brost, intel-xe

On Wed, 2025-10-29 at 13:57 -0700, Matthew Brost wrote:
> Separate the bind queue’s last fence to apply exclusively to the bind
> job, avoiding unnecessary serialization on prior TLB invalidations.
> Preserve correct user fence signaling by merging bind and TLB
> invalidation fences later in the pipeline.
> 
> Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> 
> ---
Please keep the version history.

> v3:
>  - Fix lockdep assert for migrate queues (CI)
>  - Use individual dma fence contexts for array out fences (Testing)
>  - Don't set last fence with arrays (Testing)
>  - Move TLB invalid last fence under migrate lock (Testing)
>  - Don't set queue last for migrate queues (Testing)

Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>

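The merge that replaces the per-VM composite context can be pictured
as below; a sketch under this patch's assumptions, with fences[] and
n_fence standing for the bind and TLB invalidation fences collected in
ops_execute(). A fresh, single-use fence context per array means no
timeline is shared across unrelated composites, which is what lets the
composite_fence_ctx/composite_fence_seqno fields go away:

	struct dma_fence_array *cf;

	cf = dma_fence_array_create(n_fence, fences,
				    dma_fence_context_alloc(1), 1, false);
	if (!cf)
		return ERR_PTR(-ENOMEM);	/* caller drops fences[] refs */
	return &cf->base;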

> ---
>  drivers/gpu/drm/xe/xe_pt.c            | 73 ++++++++++---------------
>  drivers/gpu/drm/xe/xe_sync.c          | 63 +++++++++++++++++-----
>  drivers/gpu/drm/xe/xe_tlb_inval_job.c | 31 ++++++++---
>  drivers/gpu/drm/xe/xe_tlb_inval_job.h |  5 +-
>  drivers/gpu/drm/xe/xe_vm.c            | 76 ++++++++++++++-------------
>  drivers/gpu/drm/xe/xe_vm_types.h      |  5 --
>  6 files changed, 143 insertions(+), 110 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index d22fd1ccc0ba..a4b9cdf016d9 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -3,8 +3,6 @@
>   * Copyright © 2022 Intel Corporation
>   */
>  
> -#include <linux/dma-fence-array.h>
> -
>  #include "xe_pt.h"
>  
>  #include "regs/xe_gtt_defs.h"
> @@ -2359,10 +2357,9 @@ xe_pt_update_ops_run(struct xe_tile *tile,
> struct xe_vma_ops *vops)
>  	struct xe_vm *vm = vops->vm;
>  	struct xe_vm_pgtable_update_ops *pt_update_ops =
>  		&vops->pt_update_ops[tile->id];
> -	struct dma_fence *fence, *ifence, *mfence;
> +	struct xe_exec_queue *q = pt_update_ops->q;
> +	struct dma_fence *fence, *ifence = NULL, *mfence = NULL;
>  	struct xe_tlb_inval_job *ijob = NULL, *mjob = NULL;
> -	struct dma_fence **fences = NULL;
> -	struct dma_fence_array *cf = NULL;
>  	struct xe_range_fence *rfence;
>  	struct xe_vma_op *op;
>  	int err = 0, i;
> @@ -2390,15 +2387,14 @@ xe_pt_update_ops_run(struct xe_tile *tile,
> struct xe_vma_ops *vops)
>  #endif
>  
>  	if (pt_update_ops->needs_invalidation) {
> -		struct xe_exec_queue *q = pt_update_ops->q;
>  		struct xe_dep_scheduler *dep_scheduler =
>  			to_dep_scheduler(q, tile->primary_gt);
>  
>  		ijob = xe_tlb_inval_job_create(q, &tile->primary_gt->tlb_inval,
> -					       dep_scheduler,
> +					       dep_scheduler, vm,
>  					       pt_update_ops->start,
>  					       pt_update_ops->last,
> -					       vm->usm.asid);
> +					       XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
>  		if (IS_ERR(ijob)) {
>  			err = PTR_ERR(ijob);
>  			goto kill_vm_tile1;
> @@ -2410,26 +2406,15 @@ xe_pt_update_ops_run(struct xe_tile *tile,
> struct xe_vma_ops *vops)
>  
>  			mjob = xe_tlb_inval_job_create(q,
>  						       &tile->media_gt->tlb_inval,
> -						       dep_scheduler,
> +						       dep_scheduler, vm,
>  						       pt_update_ops->start,
>  						       pt_update_ops->last,
> -						       vm->usm.asid);
> +						       XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT);
>  			if (IS_ERR(mjob)) {
>  				err = PTR_ERR(mjob);
>  				goto free_ijob;
>  			}
>  			update.mjob = mjob;
> -
> -			fences = kmalloc_array(2, sizeof(*fences),
> GFP_KERNEL);
> -			if (!fences) {
> -				err = -ENOMEM;
> -				goto free_ijob;
> -			}
> -			cf = dma_fence_array_alloc(2);
> -			if (!cf) {
> -				err = -ENOMEM;
> -				goto free_ijob;
> -			}
>  		}
>  	}
>  
> @@ -2460,31 +2445,12 @@ xe_pt_update_ops_run(struct xe_tile *tile,
> struct xe_vma_ops *vops)
>  				  pt_update_ops->last, fence))
>  		dma_fence_wait(fence, false);
>  
> -	/* tlb invalidation must be done before signaling
> unbind/rebind */
> -	if (ijob) {
> -		struct dma_fence *__fence;
> -
> +	if (ijob)
>  		ifence = xe_tlb_inval_job_push(ijob, tile->migrate,
> fence);
> -		__fence = ifence;
> +	if (mjob)
> +		mfence = xe_tlb_inval_job_push(mjob, tile->migrate,
> fence);
>  
> -		if (mjob) {
> -			fences[0] = ifence;
> -			mfence = xe_tlb_inval_job_push(mjob, tile->migrate,
> -						       fence);
> -			fences[1] = mfence;
> -
> -			dma_fence_array_init(cf, 2, fences,
> -					     vm->composite_fence_ctx,
> -					     vm->composite_fence_seqno++,
> -					     false);
> -			__fence = &cf->base;
> -		}
> -
> -		dma_fence_put(fence);
> -		fence = __fence;
> -	}
> -
> -	if (!mjob) {
> +	if (!mjob && !ijob) {
>  		dma_resv_add_fence(xe_vm_resv(vm), fence,
>  				   pt_update_ops->wait_vm_bookkeep ?
>  				   DMA_RESV_USAGE_KERNEL :
> @@ -2492,6 +2458,14 @@ xe_pt_update_ops_run(struct xe_tile *tile,
> struct xe_vma_ops *vops)
>  
>  		list_for_each_entry(op, &vops->list, link)
>  			op_commit(vops->vm, tile, pt_update_ops, op,
> fence, NULL);
> +	} else if (ijob && !mjob) {
> +		dma_resv_add_fence(xe_vm_resv(vm), ifence,
> +				   pt_update_ops->wait_vm_bookkeep ?
> +				   DMA_RESV_USAGE_KERNEL :
> +				   DMA_RESV_USAGE_BOOKKEEP);
> +
> +		list_for_each_entry(op, &vops->list, link)
> +			op_commit(vops->vm, tile, pt_update_ops, op,
> ifence, NULL);
>  	} else {
>  		dma_resv_add_fence(xe_vm_resv(vm), ifence,
>  				   pt_update_ops->wait_vm_bookkeep ?
> @@ -2511,16 +2485,23 @@ xe_pt_update_ops_run(struct xe_tile *tile,
> struct xe_vma_ops *vops)
>  	if (pt_update_ops->needs_svm_lock)
>  		xe_svm_notifier_unlock(vm);
>  
> +	/*
> +	 * The last fence is only used for zero bind queue idling;
> migrate
> +	 * queues are not exposed to user space.
> +	 */
> +	if (!(q->flags & EXEC_QUEUE_FLAG_MIGRATE))
> +		xe_exec_queue_last_fence_set(q, vm, fence);
> +
>  	xe_tlb_inval_job_put(mjob);
>  	xe_tlb_inval_job_put(ijob);
> +	dma_fence_put(ifence);
> +	dma_fence_put(mfence);
>  
>  	return fence;
>  
>  free_rfence:
>  	kfree(rfence);
>  free_ijob:
> -	kfree(cf);
> -	kfree(fences);
>  	xe_tlb_inval_job_put(mjob);
>  	xe_tlb_inval_job_put(ijob);
>  kill_vm_tile1:
> diff --git a/drivers/gpu/drm/xe/xe_sync.c
> b/drivers/gpu/drm/xe/xe_sync.c
> index d48ab7b32ca5..df7ca349398b 100644
> --- a/drivers/gpu/drm/xe/xe_sync.c
> +++ b/drivers/gpu/drm/xe/xe_sync.c
> @@ -14,7 +14,7 @@
>  #include <drm/drm_syncobj.h>
>  #include <uapi/drm/xe_drm.h>
>  
> -#include "xe_device_types.h"
> +#include "xe_device.h"
>  #include "xe_exec_queue.h"
>  #include "xe_macros.h"
>  #include "xe_sched_job_types.h"
> @@ -297,26 +297,67 @@ xe_sync_in_fence_get(struct xe_sync_entry
> *sync, int num_sync,
>  	struct dma_fence **fences = NULL;
>  	struct dma_fence_array *cf = NULL;
>  	struct dma_fence *fence;
> -	int i, num_in_fence = 0, current_fence = 0;
> +	int i, num_fence = 0, current_fence = 0;
>  
>  	lockdep_assert_held(&vm->lock);
>  
>  	/* Count in-fences */
>  	for (i = 0; i < num_sync; ++i) {
>  		if (sync[i].fence) {
> -			++num_in_fence;
> +			++num_fence;
>  			fence = sync[i].fence;
>  		}
>  	}
>  
>  	/* Easy case... */
> -	if (!num_in_fence) {
> +	if (!num_fence) {
> +		if (q->flags & EXEC_QUEUE_FLAG_VM) {
> +			struct xe_exec_queue *__q;
> +			struct xe_tile *tile;
> +			u8 id;
> +
> +			for_each_tile(tile, vm->xe, id)
> +				num_fence += (1 +
> XE_MAX_GT_PER_TILE);
> +
> +			fences = kmalloc_array(num_fence,
> sizeof(*fences),
> +					       GFP_KERNEL);
> +			if (!fences)
> +				return ERR_PTR(-ENOMEM);
> +
> +			fences[current_fence++] =
> +				xe_exec_queue_last_fence_get(q, vm);
> +			for_each_tlb_inval(i)
> +				fences[current_fence++] =
> +					xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
> +			list_for_each_entry(__q, &q->multi_gt_list,
> +					    multi_gt_link) {
> +				fences[current_fence++] =
> +					xe_exec_queue_last_fence_get(__q, vm);
> +				for_each_tlb_inval(i)
> +					fences[current_fence++] =
> +						xe_exec_queue_tlb_inval_last_fence_get(__q, vm, i);
> +			}
> +
> +			xe_assert(vm->xe, current_fence == num_fence);
> +			cf = dma_fence_array_create(num_fence, fences,
> +						    dma_fence_context_alloc(1),
> +						    1, false);
> +			if (!cf)
> +				goto err_out;
> +
> +			return &cf->base;
> +		}
> +
>  		fence = xe_exec_queue_last_fence_get(q, vm);
>  		return fence;
>  	}
>  
> -	/* Create composite fence */
> -	fences = kmalloc_array(num_in_fence + 1, sizeof(*fences),
> GFP_KERNEL);
> +	/*
> +	 * Create composite fence - FIXME - the below code doesn't
> work. This is
> +	 * unused in Mesa so we are ok for the moment. Perhaps we
> just disable
> +	 * this entire code path if number of in fences != 0.
> +	 */
> +	fences = kmalloc_array(num_fence + 1, sizeof(*fences),
> GFP_KERNEL);
>  	if (!fences)
>  		return ERR_PTR(-ENOMEM);
>  	for (i = 0; i < num_sync; ++i) {
> @@ -326,14 +367,10 @@ xe_sync_in_fence_get(struct xe_sync_entry
> *sync, int num_sync,
>  		}
>  	}
>  	fences[current_fence++] = xe_exec_queue_last_fence_get(q,
> vm);
> -	cf = dma_fence_array_create(num_in_fence, fences,
> -				    vm->composite_fence_ctx,
> -				    vm->composite_fence_seqno++,
> -				    false);
> -	if (!cf) {
> -		--vm->composite_fence_seqno;
> +	cf = dma_fence_array_create(num_fence, fences,
> +				    dma_fence_context_alloc(1), 1,
> false);
> +	if (!cf)
>  		goto err_out;
> -	}
>  
>  	return &cf->base;
>  
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> index 492def04a559..1ae0dec2cf31 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> @@ -12,6 +12,7 @@
>  #include "xe_tlb_inval_job.h"
>  #include "xe_migrate.h"
>  #include "xe_pm.h"
> +#include "xe_vm.h"
>  
>  /** struct xe_tlb_inval_job - TLB invalidation job */
>  struct xe_tlb_inval_job {
> @@ -21,6 +22,8 @@ struct xe_tlb_inval_job {
>  	struct xe_tlb_inval *tlb_inval;
>  	/** @q: exec queue issuing the invalidate */
>  	struct xe_exec_queue *q;
> +	/** @vm: VM which TLB invalidation is being issued for */
> +	struct xe_vm *vm;
>  	/** @refcount: ref count of this job */
>  	struct kref refcount;
>  	/**
> @@ -32,8 +35,8 @@ struct xe_tlb_inval_job {
>  	u64 start;
>  	/** @end: End address to invalidate */
>  	u64 end;
> -	/** @asid: Address space ID to invalidate */
> -	u32 asid;
> +	/** @type: GT type */
> +	int type;
>  	/** @fence_armed: Fence has been armed */
>  	bool fence_armed;
>  };
> @@ -46,7 +49,7 @@ static struct dma_fence
> *xe_tlb_inval_job_run(struct xe_dep_job *dep_job)
>  		container_of(job->fence, typeof(*ifence), base);
>  
>  	xe_tlb_inval_range(job->tlb_inval, ifence, job->start,
> -			   job->end, job->asid);
> +			   job->end, job->vm->usm.asid);
>  
>  	return job->fence;
>  }
> @@ -70,9 +73,10 @@ static const struct xe_dep_job_ops dep_job_ops = {
>   * @q: exec queue issuing the invalidate
>   * @tlb_inval: TLB invalidation client
>   * @dep_scheduler: Dependency scheduler for job
> + * @vm: VM which TLB invalidation is being issued for
>   * @start: Start address to invalidate
>   * @end: End address to invalidate
> - * @asid: Address space ID to invalidate
> + * @type: GT type
>   *
>   * Create a TLB invalidation job and initialize internal fields. The
> caller is
>   * responsible for releasing the creation reference.
> @@ -81,8 +85,8 @@ static const struct xe_dep_job_ops dep_job_ops = {
>   */
>  struct xe_tlb_inval_job *
>  xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval
> *tlb_inval,
> -			struct xe_dep_scheduler *dep_scheduler, u64
> start,
> -			u64 end, u32 asid)
> +			struct xe_dep_scheduler *dep_scheduler,
> +			struct xe_vm *vm, u64 start, u64 end, int
> type)
>  {
>  	struct xe_tlb_inval_job *job;
>  	struct drm_sched_entity *entity =
> @@ -90,19 +94,24 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q,
> struct xe_tlb_inval *tlb_inval,
>  	struct xe_tlb_inval_fence *ifence;
>  	int err;
>  
> +	xe_assert(vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
> +		  type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
> +
>  	job = kmalloc(sizeof(*job), GFP_KERNEL);
>  	if (!job)
>  		return ERR_PTR(-ENOMEM);
>  
>  	job->q = q;
> +	job->vm = vm;
>  	job->tlb_inval = tlb_inval;
>  	job->start = start;
>  	job->end = end;
> -	job->asid = asid;
>  	job->fence_armed = false;
>  	job->dep.ops = &dep_job_ops;
> +	job->type = type;
>  	kref_init(&job->refcount);
>  	xe_exec_queue_get(q);	/* Pairs with put in
> xe_tlb_inval_job_destroy */
> +	xe_vm_get(vm);		/* Pairs with put in
> xe_tlb_inval_job_destroy */
>  
>  	ifence = kmalloc(sizeof(*ifence), GFP_KERNEL);
>  	if (!ifence) {
> @@ -124,6 +133,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q,
> struct xe_tlb_inval *tlb_inval,
>  err_fence:
>  	kfree(ifence);
>  err_job:
> +	xe_vm_put(vm);
>  	xe_exec_queue_put(q);
>  	kfree(job);
>  
> @@ -138,6 +148,7 @@ static void xe_tlb_inval_job_destroy(struct kref
> *ref)
>  		container_of(job->fence, typeof(*ifence), base);
>  	struct xe_exec_queue *q = job->q;
>  	struct xe_device *xe = gt_to_xe(q->gt);
> +	struct xe_vm *vm = job->vm;
>  
>  	if (!job->fence_armed)
>  		kfree(ifence);
> @@ -147,6 +158,7 @@ static void xe_tlb_inval_job_destroy(struct kref
> *ref)
>  
>  	drm_sched_job_cleanup(&job->dep.drm);
>  	kfree(job);
> +	xe_vm_put(vm);		/* Pairs with get from
> xe_tlb_inval_job_create */
>  	xe_exec_queue_put(q);	/* Pairs with get from
> xe_tlb_inval_job_create */
>  	xe_pm_runtime_put(xe);	/* Pairs with get from
> xe_tlb_inval_job_create */
>  }
> @@ -231,6 +243,11 @@ struct dma_fence *xe_tlb_inval_job_push(struct
> xe_tlb_inval_job *job,
>  	dma_fence_get(&job->dep.drm.s_fence->finished);
>  	drm_sched_entity_push_job(&job->dep.drm);
>  
> +	/* Let the upper layers fish this out */
> +	xe_exec_queue_tlb_inval_last_fence_set(job->q, job->vm,
> +					       &job->dep.drm.s_fence->finished,
> +					       job->type);
> +
>  	xe_migrate_job_unlock(m, job->q);
>  
>  	/*
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> index e63edcb26b50..4d6df1a6c6ca 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> @@ -11,14 +11,15 @@
>  struct dma_fence;
>  struct xe_dep_scheduler;
>  struct xe_exec_queue;
> +struct xe_migrate;
>  struct xe_tlb_inval;
>  struct xe_tlb_inval_job;
> -struct xe_migrate;
> +struct xe_vm;
>  
>  struct xe_tlb_inval_job *
>  xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval
> *tlb_inval,
>  			struct xe_dep_scheduler *dep_scheduler,
> -			u64 start, u64 end, u32 asid);
> +			struct xe_vm *vm, u64 start, u64 end, int
> type);
>  
>  int xe_tlb_inval_job_alloc_dep(struct xe_tlb_inval_job *job);
>  
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 4241cc433dca..7a6e254996fb 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -1623,9 +1623,6 @@ struct xe_vm *xe_vm_create(struct xe_device
> *xe, u32 flags, struct xe_file *xef)
>  		}
>  	}
>  
> -	if (number_tiles > 1)
> -		vm->composite_fence_ctx =
> dma_fence_context_alloc(1);
> -
>  	if (xef && xe->info.has_asid) {
>  		u32 asid;
>  
> @@ -3107,20 +3104,26 @@ static struct dma_fence *ops_execute(struct
> xe_vm *vm,
>  	struct dma_fence *fence = NULL;
>  	struct dma_fence **fences = NULL;
>  	struct dma_fence_array *cf = NULL;
> -	int number_tiles = 0, current_fence = 0, err;
> +	int number_tiles = 0, current_fence = 0, n_fence = 0, err;
>  	u8 id;
>  
>  	number_tiles = vm_ops_setup_tile_args(vm, vops);
>  	if (number_tiles == 0)
>  		return ERR_PTR(-ENODATA);
>  
> -	if (number_tiles > 1) {
> -		fences = kmalloc_array(number_tiles,
> sizeof(*fences),
> -				       GFP_KERNEL);
> -		if (!fences) {
> -			fence = ERR_PTR(-ENOMEM);
> -			goto err_trace;
> -		}
> +	for_each_tile(tile, vm->xe, id)
> +		n_fence += (1 + XE_MAX_GT_PER_TILE);
> +
> +	fences = kmalloc_array(n_fence, sizeof(*fences),
> GFP_KERNEL);
> +	if (!fences) {
> +		fence = ERR_PTR(-ENOMEM);
> +		goto err_trace;
> +	}
> +
> +	cf = dma_fence_array_alloc(n_fence);
> +	if (!cf) {
> +		fence = ERR_PTR(-ENOMEM);
> +		goto err_out;
>  	}
>  
>  	for_each_tile(tile, vm->xe, id) {
> @@ -3137,29 +3140,30 @@ static struct dma_fence *ops_execute(struct
> xe_vm *vm,
>  	trace_xe_vm_ops_execute(vops);
>  
>  	for_each_tile(tile, vm->xe, id) {
> +		struct xe_exec_queue *q = vops->pt_update_ops[tile->id].q;
> +		int i;
> +
> +		fence = NULL;
>  		if (!vops->pt_update_ops[id].num_ops)
> -			continue;
> +			goto collect_fences;
>  
>  		fence = xe_pt_update_ops_run(tile, vops);
>  		if (IS_ERR(fence))
>  			goto err_out;
>  
> -		if (fences)
> -			fences[current_fence++] = fence;
> +collect_fences:
> +		fences[current_fence++] = fence ?:
> dma_fence_get_stub();
> +		xe_migrate_job_lock(tile->migrate, q);
> +		for_each_tlb_inval(i)
> +			fences[current_fence++] =
> +				xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
> +		xe_migrate_job_unlock(tile->migrate, q);
>  	}
>  
> -	if (fences) {
> -		cf = dma_fence_array_create(number_tiles, fences,
> -					    vm->composite_fence_ctx,
> -					    vm->composite_fence_seqno++,
> -					    false);
> -		if (!cf) {
> -			--vm->composite_fence_seqno;
> -			fence = ERR_PTR(-ENOMEM);
> -			goto err_out;
> -		}
> -		fence = &cf->base;
> -	}
> +	xe_assert(vm->xe, current_fence == n_fence);
> +	dma_fence_array_init(cf, n_fence, fences,
> dma_fence_context_alloc(1),
> +			     1, false);
> +	fence = &cf->base;
>  
>  	for_each_tile(tile, vm->xe, id) {
>  		if (!vops->pt_update_ops[id].num_ops)
> @@ -3220,7 +3224,6 @@ static void op_add_ufence(struct xe_vm *vm,
> struct xe_vma_op *op,
>  static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct
> xe_vma_ops *vops,
>  				   struct dma_fence *fence)
>  {
> -	struct xe_exec_queue *wait_exec_queue =
> to_wait_exec_queue(vm, vops->q);
>  	struct xe_user_fence *ufence;
>  	struct xe_vma_op *op;
>  	int i;
> @@ -3241,7 +3244,6 @@ static void vm_bind_ioctl_ops_fini(struct xe_vm
> *vm, struct xe_vma_ops *vops,
>  	if (fence) {
>  		for (i = 0; i < vops->num_syncs; i++)
>  			xe_sync_entry_signal(vops->syncs + i,
> fence);
> -		xe_exec_queue_last_fence_set(wait_exec_queue, vm,
> fence);
>  	}
>  }
>  
> @@ -3435,19 +3437,19 @@ static int vm_bind_ioctl_signal_fences(struct
> xe_vm *vm,
>  				       struct xe_sync_entry *syncs,
>  				       int num_syncs)
>  {
> -	struct dma_fence *fence;
> +	struct dma_fence *fence = NULL;
>  	int i, err = 0;
>  
> -	fence = xe_sync_in_fence_get(syncs, num_syncs,
> -				     to_wait_exec_queue(vm, q), vm);
> -	if (IS_ERR(fence))
> -		return PTR_ERR(fence);
> +	if (num_syncs) {
> +		fence = xe_sync_in_fence_get(syncs, num_syncs,
> +					     to_wait_exec_queue(vm,
> q), vm);
> +		if (IS_ERR(fence))
> +			return PTR_ERR(fence);
>  
> -	for (i = 0; i < num_syncs; i++)
> -		xe_sync_entry_signal(&syncs[i], fence);
> +		for (i = 0; i < num_syncs; i++)
> +			xe_sync_entry_signal(&syncs[i], fence);
> +	}
>  
> -	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
> -				     fence);
>  	dma_fence_put(fence);
>  
>  	return err;
> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h
> b/drivers/gpu/drm/xe/xe_vm_types.h
> index d6e2a0fdd4b3..542dbe2f9310 100644
> --- a/drivers/gpu/drm/xe/xe_vm_types.h
> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> @@ -221,11 +221,6 @@ struct xe_vm {
>  #define XE_VM_FLAG_GSC			BIT(8)
>  	unsigned long flags;
>  
> -	/** @composite_fence_ctx: context composite fence */
> -	u64 composite_fence_ctx;
> -	/** @composite_fence_seqno: seqno for composite fence */
> -	u32 composite_fence_seqno;
> -
>  	/**
>  	 * @lock: outer most lock, protects objects of anything
> attached to this
>  	 * VM


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 1/6] drm/xe: Enforce correct user fence signaling order using drm_syncobjs
  2025-10-30  7:58   ` Thomas Hellström
@ 2025-10-30 12:54     ` Matthew Brost
  0 siblings, 0 replies; 16+ messages in thread
From: Matthew Brost @ 2025-10-30 12:54 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: intel-xe

On Thu, Oct 30, 2025 at 08:58:53AM +0100, Thomas Hellström wrote:
> On Wed, 2025-10-29 at 13:57 -0700, Matthew Brost wrote:
> > Prevent application hangs caused by out-of-order fence signaling when
> > user fences are attached. Use drm_syncobj (via dma-fence-chain) to
> > guarantee that each user fence signals in order, regardless of the
> > signaling order of the attached fences. Ensure user fence writebacks
> > to user space occur in the correct sequence.
> 
> Perhaps describe exactly what the signalling order is supposed to be?
> Like calling order per exec_queue etc. (strictly I guess it's the
> grabbing order of a certain lock, like the vm resv?)
> 
> Do we need a Fixes: for this?

Yes, we probably should add a Fixes: tag. The issue looks possible to
hit, but by a quirk of timing we haven't hit it yet. I will add one
ahead of merge.

Matt
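
For reference, such a tag would follow the usual kernel convention;
the hash below is a placeholder, not the actual offending commit:

	Fixes: 123456789abc ("drm/xe: <subject of the commit being fixed>")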

> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> 
> > 
> > Signed-of-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_exec.c             |  3 +-
> >  drivers/gpu/drm/xe/xe_exec_queue.c       | 13 +++++++
> >  drivers/gpu/drm/xe/xe_exec_queue_types.h |  7 ++++
> >  drivers/gpu/drm/xe/xe_oa.c               | 45 ++++++++++++++++------
> > --
> >  drivers/gpu/drm/xe/xe_oa_types.h         |  8 +++++
> >  drivers/gpu/drm/xe/xe_sync.c             | 17 +++++++--
> >  drivers/gpu/drm/xe/xe_sync.h             |  3 ++
> >  drivers/gpu/drm/xe/xe_sync_types.h       |  3 ++
> >  drivers/gpu/drm/xe/xe_vm.c               |  4 +++
> >  9 files changed, 85 insertions(+), 18 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_exec.c
> > b/drivers/gpu/drm/xe/xe_exec.c
> > index 0dc27476832b..f7b9d8715053 100644
> > --- a/drivers/gpu/drm/xe/xe_exec.c
> > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > @@ -166,7 +166,8 @@ int xe_exec_ioctl(struct drm_device *dev, void
> > *data, struct drm_file *file)
> >  
> >  	for (num_syncs = 0; num_syncs < args->num_syncs;
> > num_syncs++) {
> >  		err = xe_sync_entry_parse(xe, xef,
> > &syncs[num_syncs],
> > -					  &syncs_user[num_syncs],
> > SYNC_PARSE_FLAG_EXEC |
> > +					  &syncs_user[num_syncs],
> > NULL, 0,
> > +					  SYNC_PARSE_FLAG_EXEC |
> >  					  (xe_vm_in_lr_mode(vm) ?
> >  					   SYNC_PARSE_FLAG_LR_MODE :
> > 0));
> >  		if (err)
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> > b/drivers/gpu/drm/xe/xe_exec_queue.c
> > index 90cbc95f8e2e..6e168efbac65 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > @@ -10,6 +10,7 @@
> >  #include <drm/drm_device.h>
> >  #include <drm/drm_drv.h>
> >  #include <drm/drm_file.h>
> > +#include <drm/drm_syncobj.h>
> >  #include <uapi/drm/xe_drm.h>
> >  
> >  #include "xe_dep_scheduler.h"
> > @@ -345,6 +346,7 @@ struct xe_exec_queue
> > *xe_exec_queue_create_bind(struct xe_device *xe,
> >  	struct xe_gt *gt = tile->primary_gt;
> >  	struct xe_exec_queue *q;
> >  	struct xe_vm *migrate_vm;
> > +	int err;
> >  
> >  	migrate_vm = xe_migrate_get_vm(tile->migrate);
> >  	if (xe->info.has_usm) {
> > @@ -368,6 +370,14 @@ struct xe_exec_queue
> > *xe_exec_queue_create_bind(struct xe_device *xe,
> >  	}
> >  	xe_vm_put(migrate_vm);
> >  
> > +	err = drm_syncobj_create(&q->ufence_syncobj,
> > +				 DRM_SYNCOBJ_CREATE_SIGNALED,
> > +				 NULL);
> > +	if (err) {
> > +		xe_exec_queue_put(q);
> > +		return ERR_PTR(err);
> > +	}
> > +
> >  	return q;
> >  }
> >  ALLOW_ERROR_INJECTION(xe_exec_queue_create_bind, ERRNO);
> > @@ -377,6 +387,9 @@ void xe_exec_queue_destroy(struct kref *ref)
> >  	struct xe_exec_queue *q = container_of(ref, struct
> > xe_exec_queue, refcount);
> >  	struct xe_exec_queue *eq, *next;
> >  
> > +	if (q->ufence_syncobj)
> > +		drm_syncobj_put(q->ufence_syncobj);
> > +
> >  	if (xe_exec_queue_uses_pxp(q))
> >  		xe_pxp_exec_queue_remove(gt_to_xe(q->gt)->pxp, q);
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > index 282505fa1377..838266c3914b 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > @@ -15,6 +15,7 @@
> >  #include "xe_hw_fence_types.h"
> >  #include "xe_lrc_types.h"
> >  
> > +struct drm_syncobj;
> >  struct xe_execlist_exec_queue;
> >  struct xe_gt;
> >  struct xe_guc_exec_queue;
> > @@ -155,6 +156,12 @@ struct xe_exec_queue {
> >  		struct list_head link;
> >  	} pxp;
> >  
> > +	/** @ufence_syncobj: User fence syncobj */
> > +	struct drm_syncobj *ufence_syncobj;
> > +
> > +	/** @ufence_timeline_value: User fence timeline value */
> > +	u64 ufence_timeline_value;
> > +
> >  	/** @ops: submission backend exec queue operations */
> >  	const struct xe_exec_queue_ops *ops;
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
> > index f901ba52b403..7a13a7bd99a6 100644
> > --- a/drivers/gpu/drm/xe/xe_oa.c
> > +++ b/drivers/gpu/drm/xe/xe_oa.c
> > @@ -10,6 +10,7 @@
> >  
> >  #include <drm/drm_drv.h>
> >  #include <drm/drm_managed.h>
> > +#include <drm/drm_syncobj.h>
> >  #include <uapi/drm/xe_drm.h>
> >  
> >  #include <generated/xe_wa_oob.h>
> > @@ -1390,7 +1391,9 @@ static int xe_oa_user_extensions(struct xe_oa *oa, enum xe_oa_user_extn_from fro
> >  	return 0;
> >  }
> >  
> > -static int xe_oa_parse_syncs(struct xe_oa *oa, struct xe_oa_open_param *param)
> > +static int xe_oa_parse_syncs(struct xe_oa *oa,
> > +			     struct xe_oa_stream *stream,
> > +			     struct xe_oa_open_param *param)
> >  {
> >  	int ret, num_syncs, num_ufence = 0;
> >  
> > @@ -1410,7 +1413,9 @@ static int xe_oa_parse_syncs(struct xe_oa *oa, struct xe_oa_open_param *param)
> >  
> >  	for (num_syncs = 0; num_syncs < param->num_syncs; num_syncs++) {
> >  		ret = xe_sync_entry_parse(oa->xe, param->xef, &param->syncs[num_syncs],
> > -					  &param->syncs_user[num_syncs], 0);
> > +					  &param->syncs_user[num_syncs],
> > +					  stream->ufence_syncobj,
> > +					  ++stream->ufence_timeline_value, 0);
> >  		if (ret)
> >  			goto err_syncs;
> >  
> > @@ -1540,7 +1545,7 @@ static long xe_oa_config_locked(struct xe_oa_stream *stream, u64 arg)
> >  		return -ENODEV;
> >  
> >  	param.xef = stream->xef;
> > -	err = xe_oa_parse_syncs(stream->oa, &param);
> > +	err = xe_oa_parse_syncs(stream->oa, stream, &param);
> >  	if (err)
> >  		goto err_config_put;
> >  
> > @@ -1636,6 +1641,7 @@ static void xe_oa_destroy_locked(struct xe_oa_stream *stream)
> >  	if (stream->exec_q)
> >  		xe_exec_queue_put(stream->exec_q);
> >  
> > +	drm_syncobj_put(stream->ufence_syncobj);
> >  	kfree(stream);
> >  }
> >  
> > @@ -1827,6 +1833,7 @@ static int xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
> >  					  struct xe_oa_open_param *param)
> >  {
> >  	struct xe_oa_stream *stream;
> > +	struct drm_syncobj *ufence_syncobj;
> >  	int stream_fd;
> >  	int ret;
> >  
> > @@ -1837,17 +1844,31 @@ static int xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
> >  		goto exit;
> >  	}
> >  
> > +	ret = drm_syncobj_create(&ufence_syncobj, DRM_SYNCOBJ_CREATE_SIGNALED,
> > +				 NULL);
> > +	if (ret)
> > +		goto exit;
> > +
> >  	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
> >  	if (!stream) {
> >  		ret = -ENOMEM;
> > -		goto exit;
> > +		goto err_syncobj;
> >  	}
> > -
> > +	stream->ufence_syncobj = ufence_syncobj;
> >  	stream->oa = oa;
> > -	ret = xe_oa_stream_init(stream, param);
> > +
> > +	ret = xe_oa_parse_syncs(oa, stream, param);
> >  	if (ret)
> >  		goto err_free;
> >  
> > +	ret = xe_oa_stream_init(stream, param);
> > +	if (ret) {
> > +		while (param->num_syncs--)
> > +			xe_sync_entry_cleanup(&param->syncs[param->num_syncs]);
> > +		kfree(param->syncs);
> > +		goto err_free;
> > +	}
> > +
> >  	if (!param->disabled) {
> >  		ret = xe_oa_enable_locked(stream);
> >  		if (ret)
> > @@ -1871,6 +1892,8 @@ static int xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
> >  	xe_oa_stream_destroy(stream);
> >  err_free:
> >  	kfree(stream);
> > +err_syncobj:
> > +	drm_syncobj_put(ufence_syncobj);
> >  exit:
> >  	return ret;
> >  }
> > @@ -2084,22 +2107,14 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f
> >  		goto err_exec_q;
> >  	}
> >  
> > -	ret = xe_oa_parse_syncs(oa, &param);
> > -	if (ret)
> > -		goto err_exec_q;
> > -
> >  	mutex_lock(&param.hwe->gt->oa.gt_lock);
> >  	ret = xe_oa_stream_open_ioctl_locked(oa, &param);
> >  	mutex_unlock(&param.hwe->gt->oa.gt_lock);
> >  	if (ret < 0)
> > -		goto err_sync_cleanup;
> > +		goto err_exec_q;
> >  
> >  	return ret;
> >  
> > -err_sync_cleanup:
> > -	while (param.num_syncs--)
> > -		xe_sync_entry_cleanup(&param.syncs[param.num_syncs]);
> > -	kfree(param.syncs);
> >  err_exec_q:
> >  	if (param.exec_q)
> >  		xe_exec_queue_put(param.exec_q);
> > diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h
> > index 2628f78c4e8d..daf701b5d48b 100644
> > --- a/drivers/gpu/drm/xe/xe_oa_types.h
> > +++ b/drivers/gpu/drm/xe/xe_oa_types.h
> > @@ -15,6 +15,8 @@
> >  #include "regs/xe_reg_defs.h"
> >  #include "xe_hw_engine_types.h"
> >  
> > +struct drm_syncobj;
> > +
> >  #define DEFAULT_XE_OA_BUFFER_SIZE SZ_16M
> >  
> >  enum xe_oa_report_header {
> > @@ -248,6 +250,12 @@ struct xe_oa_stream {
> >  	/** @xef: xe_file with which the stream was opened */
> >  	struct xe_file *xef;
> >  
> > +	/** @ufence_syncobj: User fence syncobj */
> > +	struct drm_syncobj *ufence_syncobj;
> > +
> > +	/** @ufence_timeline_value: User fence timeline value */
> > +	u64 ufence_timeline_value;
> > +
> >  	/** @last_fence: fence to use in stream destroy when needed */
> >  	struct dma_fence *last_fence;
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
> > index 82872a51f098..d48ab7b32ca5 100644
> > --- a/drivers/gpu/drm/xe/xe_sync.c
> > +++ b/drivers/gpu/drm/xe/xe_sync.c
> > @@ -113,6 +113,8 @@ static void user_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
> >  int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
> >  			struct xe_sync_entry *sync,
> >  			struct drm_xe_sync __user *sync_user,
> > +			struct drm_syncobj *ufence_syncobj,
> > +			u64 ufence_timeline_value,
> >  			unsigned int flags)
> >  {
> >  	struct drm_xe_sync sync_in;
> > @@ -192,10 +194,15 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
> >  		if (exec) {
> >  			sync->addr = sync_in.addr;
> >  		} else {
> > +			sync->ufence_timeline_value = ufence_timeline_value;
> >  			sync->ufence = user_fence_create(xe, sync_in.addr,
> >  							 sync_in.timeline_value);
> >  			if (XE_IOCTL_DBG(xe, IS_ERR(sync->ufence)))
> >  				return PTR_ERR(sync->ufence);
> > +			sync->ufence_chain_fence = dma_fence_chain_alloc();
> > +			if (!sync->ufence_chain_fence)
> > +				return -ENOMEM;
> > +			sync->ufence_syncobj = ufence_syncobj;
> >  		}
> >  
> >  		break;
> > @@ -239,7 +246,12 @@ void xe_sync_entry_signal(struct xe_sync_entry *sync, struct dma_fence *fence)
> >  	} else if (sync->ufence) {
> >  		int err;
> >  
> > -		dma_fence_get(fence);
> > +		drm_syncobj_add_point(sync->ufence_syncobj,
> > +				      sync->ufence_chain_fence,
> > +				      fence, sync->ufence_timeline_value);
> > +		sync->ufence_chain_fence = NULL;
> > +
> > +		fence = drm_syncobj_fence_get(sync->ufence_syncobj);
> >  		user_fence_get(sync->ufence);
> >  		err = dma_fence_add_callback(fence, &sync->ufence->cb,
> >  					     user_fence_cb);
> > @@ -259,7 +271,8 @@ void xe_sync_entry_cleanup(struct xe_sync_entry *sync)
> >  		drm_syncobj_put(sync->syncobj);
> >  	dma_fence_put(sync->fence);
> >  	dma_fence_chain_free(sync->chain_fence);
> > -	if (sync->ufence)
> > +	dma_fence_chain_free(sync->ufence_chain_fence);
> > +	if (!IS_ERR_OR_NULL(sync->ufence))
> >  		user_fence_put(sync->ufence);
> >  }
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_sync.h b/drivers/gpu/drm/xe/xe_sync.h
> > index 256ffc1e54dc..51f2d803e977 100644
> > --- a/drivers/gpu/drm/xe/xe_sync.h
> > +++ b/drivers/gpu/drm/xe/xe_sync.h
> > @@ -8,6 +8,7 @@
> >  
> >  #include "xe_sync_types.h"
> >  
> > +struct drm_syncobj;
> >  struct xe_device;
> >  struct xe_exec_queue;
> >  struct xe_file;
> > @@ -21,6 +22,8 @@ struct xe_vm;
> >  int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
> >  			struct xe_sync_entry *sync,
> >  			struct drm_xe_sync __user *sync_user,
> > +			struct drm_syncobj *ufence_syncobj,
> > +			u64 ufence_timeline_value,
> >  			unsigned int flags);
> >  int xe_sync_entry_add_deps(struct xe_sync_entry *sync,
> >  			   struct xe_sched_job *job);
> > diff --git a/drivers/gpu/drm/xe/xe_sync_types.h b/drivers/gpu/drm/xe/xe_sync_types.h
> > index 30ac3f51993b..b88f1833e28c 100644
> > --- a/drivers/gpu/drm/xe/xe_sync_types.h
> > +++ b/drivers/gpu/drm/xe/xe_sync_types.h
> > @@ -18,9 +18,12 @@ struct xe_sync_entry {
> >  	struct drm_syncobj *syncobj;
> >  	struct dma_fence *fence;
> >  	struct dma_fence_chain *chain_fence;
> > +	struct dma_fence_chain *ufence_chain_fence;
> > +	struct drm_syncobj *ufence_syncobj;
> >  	struct xe_user_fence *ufence;
> >  	u64 addr;
> >  	u64 timeline_value;
> > +	u64 ufence_timeline_value;
> >  	u32 type;
> >  	u32 flags;
> >  };
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index 10d77666a425..4058c476aa2d 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -3633,8 +3633,12 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >  
> >  	syncs_user = u64_to_user_ptr(args->syncs);
> >  	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
> > +		struct xe_exec_queue *__q = q ?: vm->q[0];
> > +
> >  		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
> >  					  &syncs_user[num_syncs],
> > +					  __q->ufence_syncobj,
> > +					  ++__q->ufence_timeline_value,
> >  					  (xe_vm_in_lr_mode(vm) ?
> >  					   SYNC_PARSE_FLAG_LR_MODE : 0) |
> >  					  (!args->num_binds ?

^ permalink raw reply	[flat|nested] 16+ messages in thread
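
For reference, a minimal sketch of the ordering mechanism discussed
above, in isolation: each user fence is attached behind a point on a
per-stream (or per-queue) timeline syncobj rather than directly to its
job fence, and the fence returned by drm_syncobj_fence_get() for the
newest point cannot signal before all earlier points have, so user
fence writebacks occur in submission order. The "example_" names below
are hypothetical stand-ins, not driver code, and error handling is
reduced to the essentials.

#include <linux/errno.h>
#include <linux/dma-fence.h>
#include <linux/dma-fence-chain.h>
#include <drm/drm_syncobj.h>

static int example_attach_ordered_writeback(struct drm_syncobj *syncobj,
					    u64 *timeline_value,
					    struct dma_fence *job_fence,
					    struct dma_fence_cb *cb,
					    dma_fence_func_t writeback_cb)
{
	struct dma_fence_chain *chain;
	struct dma_fence *ordered;
	int err;

	chain = dma_fence_chain_alloc();
	if (!chain)
		return -ENOMEM;

	/* Install job_fence as the next point on the timeline. */
	drm_syncobj_add_point(syncobj, chain, job_fence, ++(*timeline_value));

	/* This fence means "every point up to here has signaled". */
	ordered = drm_syncobj_fence_get(syncobj);

	/*
	 * The callback is assumed to consume the reference taken above,
	 * mirroring the driver's user fence callback path; if the
	 * timeline already signaled, fire the writeback inline.
	 */
	err = dma_fence_add_callback(ordered, cb, writeback_cb);
	if (err == -ENOENT) {
		writeback_cb(ordered, cb);
		err = 0;
	}

	return err;
}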

* Re: [PATCH v5 4/6] drm/xe: Skip TLB invalidation waits in page fault binds
  2025-10-29 20:57 ` [PATCH v5 4/6] drm/xe: Skip TLB invalidation waits in page fault binds Matthew Brost
@ 2025-11-03 15:19   ` Thomas Hellström
  0 siblings, 0 replies; 16+ messages in thread
From: Thomas Hellström @ 2025-11-03 15:19 UTC (permalink / raw)
  To: Matthew Brost, intel-xe

On Wed, 2025-10-29 at 13:57 -0700, Matthew Brost wrote:
> Avoid waiting on unrelated TLB invalidations when servicing page fault
> binds. Since the migrate queue is shared across processes, TLB
> invalidations triggered by other processes may occur concurrently but
> are not relevant to the current bind. Teach the bind pipeline to skip
> waits on such invalidations to prevent unnecessary serialization.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>

> ---
>  drivers/gpu/drm/xe/xe_vm.c       | 14 ++++++++++++--
>  drivers/gpu/drm/xe/xe_vm_types.h |  1 +
>  2 files changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 7a6e254996fb..6c77ff109fe4 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -755,6 +755,7 @@ struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_ma
>  	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
>  
>  	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
> +	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
>  	for_each_tile(tile, vm->xe, id) {
>  		vops.pt_update_ops[id].wait_vm_bookkeep = true;
>  		vops.pt_update_ops[tile->id].q =
> @@ -845,6 +846,7 @@ struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
>  	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
>  
>  	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
> +	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
>  	for_each_tile(tile, vm->xe, id) {
>  		vops.pt_update_ops[id].wait_vm_bookkeep = true;
>  		vops.pt_update_ops[tile->id].q =
> @@ -3111,8 +3113,13 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
>  	if (number_tiles == 0)
>  		return ERR_PTR(-ENODATA);
>  
> -	for_each_tile(tile, vm->xe, id)
> -		n_fence += (1 + XE_MAX_GT_PER_TILE);
> +	if (vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT) {
> +		for_each_tile(tile, vm->xe, id)
> +			++n_fence;
> +	} else {
> +		for_each_tile(tile, vm->xe, id)
> +			n_fence += (1 + XE_MAX_GT_PER_TILE);
> +	}
>  
>  	fences = kmalloc_array(n_fence, sizeof(*fences), GFP_KERNEL);
>  	if (!fences) {
> @@ -3153,6 +3160,9 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
>  
> collect_fences:
>  		fences[current_fence++] = fence ?: dma_fence_get_stub();
> +		if (vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT)
> +			continue;
> +
>  		xe_migrate_job_lock(tile->migrate, q);
>  		for_each_tlb_inval(i)
>  			fences[current_fence++] =
> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> index 542dbe2f9310..3766dc37b3ad 100644
> --- a/drivers/gpu/drm/xe/xe_vm_types.h
> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> @@ -466,6 +466,7 @@ struct xe_vma_ops {
>  #define XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH BIT(0)
>  #define XE_VMA_OPS_FLAG_MADVISE          BIT(1)
>  #define XE_VMA_OPS_ARRAY_OF_BINDS	 BIT(2)
> +#define XE_VMA_OPS_FLAG_SKIP_TLB_WAIT	 BIT(3)
>  	u32 flags;
>  #ifdef TEST_VM_OPS_ERROR
>  	/** @inject_error: inject error to test error handling */


^ permalink raw reply	[flat|nested] 16+ messages in thread
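
To make the fence accounting above concrete: with the skip flag set,
ops_execute() reserves one fence slot per tile (the bind job fence
only); otherwise each tile also contributes one slot per possible GT
for TLB invalidation last fences. A small sketch under the patch's
definitions (XE_VMA_OPS_FLAG_SKIP_TLB_WAIT and XE_MAX_GT_PER_TILE come
from the diff; the helper name is hypothetical):

static int example_fence_slots(u32 vops_flags, int num_tiles)
{
	if (vops_flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT)
		return num_tiles;	/* bind job fence per tile only */

	/* job fence plus one TLB invalidation last fence per possible GT */
	return num_tiles * (1 + XE_MAX_GT_PER_TILE);
}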

* Re: [PATCH v5 5/6] drm/xe: Disallow input fences on zero batch execs and zero binds
  2025-10-29 20:57 ` [PATCH v5 5/6] drm/xe: Disallow input fences on zero batch execs and zero binds Matthew Brost
@ 2025-11-03 15:21   ` Thomas Hellström
  2025-11-03 15:22     ` Thomas Hellström
  0 siblings, 1 reply; 16+ messages in thread
From: Thomas Hellström @ 2025-11-03 15:21 UTC (permalink / raw)
  To: Matthew Brost, intel-xe

On Wed, 2025-10-29 at 13:57 -0700, Matthew Brost wrote:
> Prevent input fences from being installed on zero batch execs or zero
> binds, which were originally added to support queue idling in Mesa via
> output fences. Although input fence support was introduced for
> interface consistency, it leads to incorrect behavior due to chained
> composite fences, which are disallowed.
> 
> Avoid the complexity of fixing this by removing support, as input
> fences for these cases are not used in practice.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>

> ---
>  drivers/gpu/drm/xe/xe_sync.c | 101 +++++++++++++----------------------
>  1 file changed, 36 insertions(+), 65 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
> index df7ca349398b..ff74528ca0c6 100644
> --- a/drivers/gpu/drm/xe/xe_sync.c
> +++ b/drivers/gpu/drm/xe/xe_sync.c
> @@ -301,84 +301,55 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
>  
>  	lockdep_assert_held(&vm->lock);
>  
> -	/* Count in-fences */
> -	for (i = 0; i < num_sync; ++i) {
> -		if (sync[i].fence) {
> -			++num_fence;
> -			fence = sync[i].fence;
> -		}
> -	}
> -
> -	/* Easy case... */
> -	if (!num_fence) {
> -		if (q->flags & EXEC_QUEUE_FLAG_VM) {
> -			struct xe_exec_queue *__q;
> -			struct xe_tile *tile;
> -			u8 id;
> -
> -			for_each_tile(tile, vm->xe, id)
> -				num_fence += (1 + XE_MAX_GT_PER_TILE);
> -
> -			fences = kmalloc_array(num_fence, sizeof(*fences),
> -					       GFP_KERNEL);
> -			if (!fences)
> -				return ERR_PTR(-ENOMEM);
> -
> +	/* Reject in fences */
> +	for (i = 0; i < num_sync; ++i)
> +		if (sync[i].fence)
> +			return ERR_PTR(-EOPNOTSUPP);
> +
> +	if (q->flags & EXEC_QUEUE_FLAG_VM) {
> +		struct xe_exec_queue *__q;
> +		struct xe_tile *tile;
> +		u8 id;
> +
> +		for_each_tile(tile, vm->xe, id)
> +			num_fence += (1 + XE_MAX_GT_PER_TILE);
> +
> +		fences = kmalloc_array(num_fence, sizeof(*fences),
> +				       GFP_KERNEL);
> +		if (!fences)
> +			return ERR_PTR(-ENOMEM);
> +
> +		fences[current_fence++] =
> +			xe_exec_queue_last_fence_get(q, vm);
> +		for_each_tlb_inval(i)
> +			fences[current_fence++] =
> +				xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
> +		list_for_each_entry(__q, &q->multi_gt_list,
> +				    multi_gt_link) {
>  			fences[current_fence++] =
> -				xe_exec_queue_last_fence_get(q, vm);
> +				xe_exec_queue_last_fence_get(__q, vm);
>  			for_each_tlb_inval(i)
>  				fences[current_fence++] =
> -					xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
> -			list_for_each_entry(__q, &q->multi_gt_list,
> -					    multi_gt_link) {
> -				fences[current_fence++] =
> -					xe_exec_queue_last_fence_get(__q, vm);
> -				for_each_tlb_inval(i)
> -					fences[current_fence++] =
> -						xe_exec_queue_tlb_inval_last_fence_get(__q, vm, i);
> -			}
> -
> -			xe_assert(vm->xe, current_fence == num_fence);
> -			cf = dma_fence_array_create(num_fence, fences,
> -						    dma_fence_context_alloc(1),
> -						    1, false);
> -			if (!cf)
> -				goto err_out;
> -
> -			return &cf->base;
> +					xe_exec_queue_tlb_inval_last_fence_get(__q, vm, i);
>  		}
>  
> -		fence = xe_exec_queue_last_fence_get(q, vm);
> -		return fence;
> -	}
> +		xe_assert(vm->xe, current_fence == num_fence);
> +		cf = dma_fence_array_create(num_fence, fences,
> +					    dma_fence_context_alloc(1),
> +					    1, false);
> +		if (!cf)
> +			goto err_out;
>  
> -	/*
> -	 * Create composite fence - FIXME - the below code doesn't work. This is
> -	 * unused in Mesa so we are ok for the moment. Perhaps we just disable
> -	 * this entire code path if number of in fences != 0.
> -	 */
> -	fences = kmalloc_array(num_fence + 1, sizeof(*fences), GFP_KERNEL);
> -	if (!fences)
> -		return ERR_PTR(-ENOMEM);
> -	for (i = 0; i < num_sync; ++i) {
> -		if (sync[i].fence) {
> -			dma_fence_get(sync[i].fence);
> -			fences[current_fence++] = sync[i].fence;
> -		}
> +		return &cf->base;
>  	}
> -	fences[current_fence++] = xe_exec_queue_last_fence_get(q, vm);
> -	cf = dma_fence_array_create(num_fence, fences,
> -				    dma_fence_context_alloc(1), 1, false);
> -	if (!cf)
> -		goto err_out;
>  
> -	return &cf->base;
> +	fence = xe_exec_queue_last_fence_get(q, vm);
> +	return fence;
>  
>  err_out:
>  	while (current_fence)
>  		dma_fence_put(fences[--current_fence]);
>  	kfree(fences);
> -	kfree(cf);
>  
>  	return ERR_PTR(-ENOMEM);
>  }


^ permalink raw reply	[flat|nested] 16+ messages in thread
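
From the userspace side, the visible change is roughly as follows: a
zero bind may still carry an out-fence (DRM_XE_SYNC_FLAG_SIGNAL set)
to fence queue idle, while an in-fence on the same ioctl is now
rejected with EOPNOTSUPP. A hedged sketch against the xe uAPI header;
the helper name is hypothetical and error handling is elided.

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/xe_drm.h>

int example_zero_bind_idle_fence(int fd, uint32_t vm_id,
				 uint32_t exec_queue_id,
				 uint32_t syncobj_handle)
{
	struct drm_xe_sync sync = {
		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
		.flags = DRM_XE_SYNC_FLAG_SIGNAL,	/* out-fence: still allowed */
		.handle = syncobj_handle,
	};
	struct drm_xe_vm_bind bind = {
		.vm_id = vm_id,
		.exec_queue_id = exec_queue_id,
		.num_binds = 0,				/* zero bind */
		.num_syncs = 1,
		.syncs = (uintptr_t)&sync,
	};

	/*
	 * Clearing DRM_XE_SYNC_FLAG_SIGNAL above would make this an
	 * in-fence, which this series now rejects with EOPNOTSUPP.
	 */
	return ioctl(fd, DRM_IOCTL_XE_VM_BIND, &bind);
}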

* Re: [PATCH v5 5/6] drm/xe: Disallow input fences on zero batch execs and zero binds
  2025-11-03 15:21   ` Thomas Hellström
@ 2025-11-03 15:22     ` Thomas Hellström
  0 siblings, 0 replies; 16+ messages in thread
From: Thomas Hellström @ 2025-11-03 15:22 UTC (permalink / raw)
  To: Matthew Brost, intel-xe

On Mon, 2025-11-03 at 16:21 +0100, Thomas Hellström wrote:
> On Wed, 2025-10-29 at 13:57 -0700, Matthew Brost wrote:
> > Prevent input fences from being installed on zero batch execs or
> > zero binds, which were originally added to support queue idling in
> > Mesa via output fences. Although input fence support was introduced
> > for interface consistency, it leads to incorrect behavior due to
> > chained composite fences, which are disallowed.
> > 
> > Avoid the complexity of fixing this by removing support, as input
> > fences for these cases are not used in practice.
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> 
> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>

Oh, please mention it's a uapi change by using the drm/xe/uapi: subject prefix.

/Thomas





^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 6/6] drm/xe: Remove last fence dependency check from binds
  2025-10-29 20:57 ` [PATCH v5 6/6] drm/xe: Remove last fence dependency check from binds Matthew Brost
  2025-10-30  8:43   ` Thomas Hellström
@ 2025-11-03 15:24   ` Thomas Hellström
  1 sibling, 0 replies; 16+ messages in thread
From: Thomas Hellström @ 2025-11-03 15:24 UTC (permalink / raw)
  To: Matthew Brost, intel-xe

On Wed, 2025-10-29 at 13:57 -0700, Matthew Brost wrote:
> Eliminate redundant last fence dependency checks in bind jobs, as they
> are now equivalent to xe_exec_queue_is_idle. Simplify the code by
> removing this dead logic.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>


> ---
>  drivers/gpu/drm/xe/xe_exec_queue.c | 23 -----------------------
>  drivers/gpu/drm/xe/xe_exec_queue.h |  2 --
>  drivers/gpu/drm/xe/xe_pt.c         |  7 -------
>  3 files changed, 32 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index b7551592fe3f..3486744a8dfd 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -1122,29 +1122,6 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm,
>  	q->last_fence = dma_fence_get(fence);
>  }
>  
> -/**
> - * xe_exec_queue_last_fence_test_dep - Test last fence dependency of queue
> - * @q: The exec queue
> - * @vm: The VM the engine does a bind or exec for
> - *
> - * Returns:
> - * -ETIME if there exists an unsignalled last fence dependency, zero otherwise.
> - */
> -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> -{
> -	struct dma_fence *fence;
> -	int err = 0;
> -
> -	fence = xe_exec_queue_last_fence_get(q, vm);
> -	if (fence) {
> -		err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) ?
> -			0 : -ETIME;
> -		dma_fence_put(fence);
> -	}
> -
> -	return err;
> -}
> -
>  /**
>   * xe_exec_queue_tlb_inval_last_fence_put() - Drop ref to last TLB invalidation fence
>   * @q: The exec queue
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> index 1ba7354b33d1..fda4d4f9bda8 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> @@ -88,8 +88,6 @@ struct dma_fence *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
>  							  struct xe_vm *vm);
>  void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
>  				  struct dma_fence *fence);
> -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> -				      struct xe_vm *vm);
>  
>  void xe_exec_queue_tlb_inval_last_fence_put(struct xe_exec_queue *q,
>  					    struct xe_vm *vm,
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index a4b9cdf016d9..01056b51ac9f 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -1338,13 +1338,6 @@ static int xe_pt_vm_dependencies(struct xe_sched_job *job,
>  			return err;
>  	}
>  
> -	if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
> -		if (job)
> -			err = xe_sched_job_last_fence_add_dep(job, vm);
> -		else
> -			err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> -	}
> -
>  	for (i = 0; job && !err && i < vops->num_syncs; i++)
>  		err = xe_sync_entry_add_deps(&vops->syncs[i], job);
>  


^ permalink raw reply	[flat|nested] 16+ messages in thread
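
The equivalence claimed in the commit message, spelled out: the removed
helper returned -ETIME while the queue's last fence was unsignaled,
which is the same condition an idleness query answers once the last
fence tracks all outstanding work. A hedged sketch, assuming the
existing xe_exec_queue_is_idle() helper; the wrapper name is
hypothetical.

/* What the removed last-fence test reduces to for a bind queue. */
static int example_bind_dep_check(struct xe_exec_queue *q)
{
	return xe_exec_queue_is_idle(q) ? 0 : -ETIME;
}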

end of thread

Thread overview: 16+ messages
2025-10-29 20:57 [PATCH v5 0/6] Fix serialization on burst of unbinds - v2 Matthew Brost
2025-10-29 20:57 ` [PATCH v5 1/6] drm/xe: Enforce correct user fence signaling order using drm_syncobjs Matthew Brost
2025-10-30  7:58   ` Thomas Hellström
2025-10-30 12:54     ` Matthew Brost
2025-10-29 20:57 ` [PATCH v5 2/6] drm/xe: Attach last fence to TLB invalidation job queues Matthew Brost
2025-10-30  8:24   ` Thomas Hellström
2025-10-29 20:57 ` [PATCH v5 3/6] drm/xe: Decouple bind queue last fence from TLB invalidations Matthew Brost
2025-10-30  9:52   ` Thomas Hellström
2025-10-29 20:57 ` [PATCH v5 4/6] drm/xe: Skip TLB invalidation waits in page fault binds Matthew Brost
2025-11-03 15:19   ` Thomas Hellström
2025-10-29 20:57 ` [PATCH v5 5/6] drm/xe: Disallow input fences on zero batch execs and zero binds Matthew Brost
2025-11-03 15:21   ` Thomas Hellström
2025-11-03 15:22     ` Thomas Hellström
2025-10-29 20:57 ` [PATCH v5 6/6] drm/xe: Remove last fence dependency check from binds Matthew Brost
2025-10-30  8:43   ` Thomas Hellström
2025-11-03 15:24   ` Thomas Hellström
