Intel-XE Archive on lore.kernel.org
* [Intel-xe] [RFC PATCH 0/7] Syncs vs async exec/bind uAPI change
@ 2023-12-07  5:57 Matthew Brost
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 1/7] drm/xe: Use a flags field instead of bools for VMA create Matthew Brost
                   ` (7 more replies)
  0 siblings, 8 replies; 22+ messages in thread
From: Matthew Brost @ 2023-12-07  5:57 UTC (permalink / raw)
  To: intel-xe

The last patch should explain the change nicely; the series is built on
top of [1].

This is a backwards-incompatible uAPI change; the corresponding IGT
change will be on the list shortly.

Matt

[1] https://patchwork.freedesktop.org/series/126546/

Matthew Brost (7):
  drm/xe: Use a flags field instead of bools for VMA create
  drm/xe: Use a flags field instead of bools for sync parse
  drm/xe: Allow num_binds == 0 in VM bind IOCTL
  drm/xe: Allow num_batch_buffer == 0 in exec IOCTL
  drm/xe: Take in-syncs into account when num_execs or num_binds == 0
  drm/xe: Add last fence as dependency for jobs on user exec queues
  drm/xe/uapi: Uniform async vs sync handling

 drivers/gpu/drm/xe/xe_exec.c             |  88 +++++++--
 drivers/gpu/drm/xe/xe_exec_queue.c       |  12 +-
 drivers/gpu/drm/xe/xe_exec_queue_types.h |   7 +-
 drivers/gpu/drm/xe/xe_migrate.c          |  14 +-
 drivers/gpu/drm/xe/xe_sched_job.c        |  17 ++
 drivers/gpu/drm/xe/xe_sched_job.h        |   4 +
 drivers/gpu/drm/xe/xe_sync.c             |  82 +++++++-
 drivers/gpu/drm/xe/xe_sync.h             |  11 +-
 drivers/gpu/drm/xe/xe_vm.c               | 242 ++++++++++++-----------
 drivers/gpu/drm/xe/xe_vm_types.h         |  15 +-
 include/uapi/drm/xe_drm.h                |  56 +++---
 11 files changed, 373 insertions(+), 175 deletions(-)

-- 
2.34.1



* [Intel-xe] [RFC PATCH 1/7] drm/xe: Use a flags field instead of bools for VMA create
  2023-12-07  5:57 [Intel-xe] [RFC PATCH 0/7] Syncs vs async exec/bind uAPI change Matthew Brost
@ 2023-12-07  5:57 ` Matthew Brost
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 2/7] drm/xe: Use a flags field instead of bools for sync parse Matthew Brost
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Matthew Brost @ 2023-12-07  5:57 UTC (permalink / raw)
  To: intel-xe

Use a flags field instead of several bools for VMA create as it is
easier to read and less bug-prone.
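
The readability win is mostly at the call sites. A minimal before/after
sketch (illustrative only, not code from this patch):

	/* before: which bool is which? */
	vma = xe_vma_create(vm, bo, offset, start, end, false, true, pat);

	/* after: self-documenting at the call site */
	vma = xe_vma_create(vm, bo, offset, start, end, pat,
			    VMA_CREATE_FLAG_IS_NULL);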

Suggested-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/xe/xe_vm.c | 64 ++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index e09050f16f07..44b2972d5d5f 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -860,17 +860,20 @@ struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
 	return fence;
 }
 
+#define VMA_CREATE_FLAG_READ_ONLY	BIT(0)
+#define VMA_CREATE_FLAG_IS_NULL		BIT(1)
+
 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
 				    struct xe_bo *bo,
 				    u64 bo_offset_or_userptr,
 				    u64 start, u64 end,
-				    bool read_only,
-				    bool is_null,
-				    u16 pat_index)
+				    u16 pat_index, unsigned int flags)
 {
 	struct xe_vma *vma;
 	struct xe_tile *tile;
 	u8 id;
+	bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
+	bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
 
 	xe_assert(vm->xe, start < end);
 	xe_assert(vm->xe, end < vm->size);
@@ -2242,7 +2245,7 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
 }
 
 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
-			      bool read_only, bool is_null, u16 pat_index)
+			      u16 pat_index, unsigned int flags)
 {
 	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
 	struct xe_vma *vma;
@@ -2257,8 +2260,7 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
 	}
 	vma = xe_vma_create(vm, bo, op->gem.offset,
 			    op->va.addr, op->va.addr +
-			    op->va.range - 1, read_only, is_null,
-			    pat_index);
+			    op->va.range - 1, pat_index, flags);
 	if (bo)
 		xe_bo_unlock(bo);
 
@@ -2384,7 +2386,9 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
 
 	drm_gpuva_for_each_op(__op, ops) {
 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
+		struct xe_vma *vma;
 		bool first = list_empty(ops_list);
+		unsigned int flags = 0;
 
 		INIT_LIST_HEAD(&op->link);
 		list_add_tail(&op->link, ops_list);
@@ -2400,10 +2404,13 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
 		switch (op->base.op) {
 		case DRM_GPUVA_OP_MAP:
 		{
-			struct xe_vma *vma;
+			flags |= op->map.read_only ?
+				VMA_CREATE_FLAG_READ_ONLY : 0;
+			flags |= op->map.is_null ?
+				VMA_CREATE_FLAG_IS_NULL : 0;
 
-			vma = new_vma(vm, &op->base.map, op->map.read_only,
-				      op->map.is_null, op->map.pat_index);
+			vma = new_vma(vm, &op->base.map, op->map.pat_index,
+				      flags);
 			if (IS_ERR(vma))
 				return PTR_ERR(vma);
 
@@ -2419,16 +2426,15 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
 			op->remap.range = xe_vma_size(old);
 
 			if (op->base.remap.prev) {
-				struct xe_vma *vma;
-				bool read_only =
-					op->base.remap.unmap->va->flags &
-					XE_VMA_READ_ONLY;
-				bool is_null =
-					op->base.remap.unmap->va->flags &
-					DRM_GPUVA_SPARSE;
-
-				vma = new_vma(vm, op->base.remap.prev, read_only,
-					      is_null, old->pat_index);
+				flags |= op->base.remap.unmap->va->flags &
+					XE_VMA_READ_ONLY ?
+					VMA_CREATE_FLAG_READ_ONLY : 0;
+				flags |= op->base.remap.unmap->va->flags &
+					DRM_GPUVA_SPARSE ?
+					VMA_CREATE_FLAG_IS_NULL : 0;
+
+				vma = new_vma(vm, op->base.remap.prev,
+					      old->pat_index, flags);
 				if (IS_ERR(vma))
 					return PTR_ERR(vma);
 
@@ -2451,17 +2457,15 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
 			}
 
 			if (op->base.remap.next) {
-				struct xe_vma *vma;
-				bool read_only =
-					op->base.remap.unmap->va->flags &
-					XE_VMA_READ_ONLY;
-
-				bool is_null =
-					op->base.remap.unmap->va->flags &
-					DRM_GPUVA_SPARSE;
-
-				vma = new_vma(vm, op->base.remap.next, read_only,
-					      is_null, old->pat_index);
+				flags |= op->base.remap.unmap->va->flags &
+					XE_VMA_READ_ONLY ?
+					VMA_CREATE_FLAG_READ_ONLY : 0;
+				flags |= op->base.remap.unmap->va->flags &
+					DRM_GPUVA_SPARSE ?
+					VMA_CREATE_FLAG_IS_NULL : 0;
+
+				vma = new_vma(vm, op->base.remap.next,
+					      old->pat_index, flags);
 				if (IS_ERR(vma))
 					return PTR_ERR(vma);
 
-- 
2.34.1



* [Intel-xe] [RFC PATCH 2/7] drm/xe: Use a flags field instead of bools for sync parse
  2023-12-07  5:57 [Intel-xe] [RFC PATCH 0/7] Syncs vs async exec/bind uAPI change Matthew Brost
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 1/7] drm/xe: Use a flags field instead of bools for VMA create Matthew Brost
@ 2023-12-07  5:57 ` Matthew Brost
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 3/7] drm/xe: Allow num_binds == 0 in VM bind IOCTL Matthew Brost
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Matthew Brost @ 2023-12-07  5:57 UTC (permalink / raw)
  To: intel-xe

Use a flags field instead of several bools for sync parse as it is
easier to read and less bug-prone.

v2: Pull in header change from subsequent patch

Suggested-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/xe/xe_exec.c | 5 +++--
 drivers/gpu/drm/xe/xe_sync.c | 4 +++-
 drivers/gpu/drm/xe/xe_sync.h | 5 ++++-
 drivers/gpu/drm/xe/xe_vm.c   | 5 +++--
 4 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 347239f28170..a8a025495b14 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -181,8 +181,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 	for (i = 0; i < args->num_syncs; i++) {
 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
-					  &syncs_user[i], true,
-					  xe_vm_in_lr_mode(vm));
+					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
+					  (xe_vm_in_lr_mode(vm) ?
+					   SYNC_PARSE_FLAG_LR_MODE : 0));
 		if (err)
 			goto err_syncs;
 	}
diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index 936227e79483..2a3f508722fc 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -98,10 +98,12 @@ static void user_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
 int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
 			struct xe_sync_entry *sync,
 			struct drm_xe_sync __user *sync_user,
-			bool exec, bool in_lr_mode)
+			unsigned int flags)
 {
 	struct drm_xe_sync sync_in;
 	int err;
+	bool exec = flags & SYNC_PARSE_FLAG_EXEC;
+	bool in_lr_mode = flags & SYNC_PARSE_FLAG_LR_MODE;
 	bool signal;
 
 	if (copy_from_user(&sync_in, sync_user, sizeof(*sync_user)))
diff --git a/drivers/gpu/drm/xe/xe_sync.h b/drivers/gpu/drm/xe/xe_sync.h
index 30958ddc4cdc..1b748cec4678 100644
--- a/drivers/gpu/drm/xe/xe_sync.h
+++ b/drivers/gpu/drm/xe/xe_sync.h
@@ -12,10 +12,13 @@ struct xe_device;
 struct xe_file;
 struct xe_sched_job;
 
+#define SYNC_PARSE_FLAG_EXEC			BIT(0)
+#define SYNC_PARSE_FLAG_LR_MODE			BIT(1)
+
 int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
 			struct xe_sync_entry *sync,
 			struct drm_xe_sync __user *sync_user,
-			bool exec, bool compute_mode);
+			unsigned int flags);
 int xe_sync_entry_wait(struct xe_sync_entry *sync);
 int xe_sync_entry_add_deps(struct xe_sync_entry *sync,
 			   struct xe_sched_job *job);
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 44b2972d5d5f..42077e3db36a 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3106,8 +3106,9 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	syncs_user = u64_to_user_ptr(args->syncs);
 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
-					  &syncs_user[num_syncs], false,
-					  xe_vm_in_lr_mode(vm));
+					  &syncs_user[num_syncs],
+					  xe_vm_in_lr_mode(vm) ?
+					  SYNC_PARSE_FLAG_LR_MODE : 0);
 		if (err)
 			goto free_syncs;
 	}
-- 
2.34.1



* [Intel-xe] [RFC PATCH 3/7] drm/xe: Allow num_binds == 0 in VM bind IOCTL
  2023-12-07  5:57 [Intel-xe] [RFC PATCH 0/7] Syncs vs async exec/bind uAPI change Matthew Brost
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 1/7] drm/xe: Use a flags field instead of bools for VMA create Matthew Brost
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 2/7] drm/xe: Use a flags field instead of bools for sync parse Matthew Brost
@ 2023-12-07  5:57 ` Matthew Brost
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 4/7] drm/xe: Allow num_batch_buffer == 0 in exec IOCTL Matthew Brost
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Matthew Brost @ 2023-12-07  5:57 UTC (permalink / raw)
  To: intel-xe

The idea is that out-syncs can signal that all previous operations on
the bind queue are complete. An example use case is easily implementing
vkQueueWaitIdle.

v2: s/vkQueueWaitForIdle/vkQueueWaitIdle
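
For reference, a hypothetical userspace sketch of the resulting "flush
the bind queue" pattern (handles and values are illustrative):

	struct drm_xe_sync out_sync = {
		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
		.handle = syncobj_handle,
	};
	struct drm_xe_vm_bind bind = {
		.vm_id = vm_id,
		.exec_queue_id = bind_queue_id,
		.num_binds = 0,		/* no new ops, just fence prior ones */
		.num_syncs = 1,
		.syncs = (uintptr_t)&out_sync,
	};

	/* out_sync signals once all prior binds on the queue complete */
	ioctl(fd, DRM_IOCTL_XE_VM_BIND, &bind);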

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/xe/xe_sync.c |  4 ++++
 drivers/gpu/drm/xe/xe_sync.h |  1 +
 drivers/gpu/drm/xe/xe_vm.c   | 36 ++++++++++++++++++++++--------------
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index 2a3f508722fc..d0f118223fa2 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -104,6 +104,7 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
 	int err;
 	bool exec = flags & SYNC_PARSE_FLAG_EXEC;
 	bool in_lr_mode = flags & SYNC_PARSE_FLAG_LR_MODE;
+	bool disallow_user_fence = flags & SYNC_PARSE_FLAG_DISALLOW_USER_FENCE;
 	bool signal;
 
 	if (copy_from_user(&sync_in, sync_user, sizeof(*sync_user)))
@@ -164,6 +165,9 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
 		break;
 
 	case DRM_XE_SYNC_TYPE_USER_FENCE:
+		if (XE_IOCTL_DBG(xe, disallow_user_fence))
+			return -EOPNOTSUPP;
+
 		if (XE_IOCTL_DBG(xe, !signal))
 			return -EOPNOTSUPP;
 
diff --git a/drivers/gpu/drm/xe/xe_sync.h b/drivers/gpu/drm/xe/xe_sync.h
index 1b748cec4678..45f4371e94b9 100644
--- a/drivers/gpu/drm/xe/xe_sync.h
+++ b/drivers/gpu/drm/xe/xe_sync.h
@@ -14,6 +14,7 @@ struct xe_sched_job;
 
 #define SYNC_PARSE_FLAG_EXEC			BIT(0)
 #define SYNC_PARSE_FLAG_LR_MODE			BIT(1)
+#define SYNC_PARSE_FLAG_DISALLOW_USER_FENCE	BIT(2)
 
 int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef,
 			struct xe_sync_entry *sync,
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 42077e3db36a..f6de0584ea91 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -2834,7 +2834,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
 		return -EINVAL;
 
 	if (XE_IOCTL_DBG(xe, args->extensions) ||
-	    XE_IOCTL_DBG(xe, !args->num_binds) ||
 	    XE_IOCTL_DBG(xe, args->num_binds > MAX_BINDS))
 		return -EINVAL;
 
@@ -2987,7 +2986,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 			goto put_exec_queue;
 		}
 
-		if (XE_IOCTL_DBG(xe, async !=
+		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
 				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
 			err = -EINVAL;
 			goto put_exec_queue;
@@ -3001,7 +3000,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	}
 
 	if (!args->exec_queue_id) {
-		if (XE_IOCTL_DBG(xe, async !=
+		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
 				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
 			err = -EINVAL;
 			goto put_vm;
@@ -3028,16 +3027,18 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 		}
 	}
 
-	bos = kzalloc(sizeof(*bos) * args->num_binds, GFP_KERNEL);
-	if (!bos) {
-		err = -ENOMEM;
-		goto release_vm_lock;
-	}
+	if (args->num_binds) {
+		bos = kcalloc(args->num_binds, sizeof(*bos), GFP_KERNEL);
+		if (!bos) {
+			err = -ENOMEM;
+			goto release_vm_lock;
+		}
 
-	ops = kzalloc(sizeof(*ops) * args->num_binds, GFP_KERNEL);
-	if (!ops) {
-		err = -ENOMEM;
-		goto release_vm_lock;
+		ops = kcalloc(args->num_binds, sizeof(*ops), GFP_KERNEL);
+		if (!ops) {
+			err = -ENOMEM;
+			goto release_vm_lock;
+		}
 	}
 
 	for (i = 0; i < args->num_binds; ++i) {
@@ -3107,12 +3108,19 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
 					  &syncs_user[num_syncs],
-					  xe_vm_in_lr_mode(vm) ?
-					  SYNC_PARSE_FLAG_LR_MODE : 0);
+					  (xe_vm_in_lr_mode(vm) ?
+					   SYNC_PARSE_FLAG_LR_MODE : 0) |
+					  (!args->num_binds ?
+					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
 		if (err)
 			goto free_syncs;
 	}
 
+	if (!args->num_binds) {
+		err = -ENODATA;
+		goto free_syncs;
+	}
+
 	for (i = 0; i < args->num_binds; ++i) {
 		u64 range = bind_ops[i].range;
 		u64 addr = bind_ops[i].addr;
-- 
2.34.1



* [Intel-xe] [RFC PATCH 4/7] drm/xe: Allow num_batch_buffer == 0 in exec IOCTL
  2023-12-07  5:57 [Intel-xe] [RFC PATCH 0/7] Syncs vs async exec/bind uAPI change Matthew Brost
                   ` (2 preceding siblings ...)
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 3/7] drm/xe: Allow num_binds == 0 in VM bind IOCTL Matthew Brost
@ 2023-12-07  5:57 ` Matthew Brost
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 5/7] drm/xe: Take in-syncs into account when num_execs or num_binds == 0 Matthew Brost
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Matthew Brost @ 2023-12-07  5:57 UTC (permalink / raw)
  To: intel-xe

The idea is that out-syncs can signal that all previous operations on
the exec queue are complete. An example use case is easily implementing
vkQueueWaitIdle.

v2: Don't add last_fence for VM's that do not support dma fences
v3: Use a flags field instead of several bools in sync parse (Thomas)
v4: s/vkQueueWaitForIdle/vkQueueWaitIdle
v5: Fix inverted lr_mode checks
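
The matching hypothetical sketch for an exec-queue flush (values
illustrative; out_sync as in the previous patch):

	struct drm_xe_exec exec = {
		.exec_queue_id = queue_id,
		.num_batch_buffer = 0,	/* no batch, just fence prior jobs */
		.num_syncs = 1,
		.syncs = (uintptr_t)&out_sync,
	};

	ioctl(fd, DRM_IOCTL_XE_EXEC, &exec);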

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/xe/xe_exec.c             | 17 ++++++++++++++++-
 drivers/gpu/drm/xe/xe_exec_queue.c       |  5 ++++-
 drivers/gpu/drm/xe/xe_exec_queue_types.h |  5 +++--
 3 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index a8a025495b14..96d7506a4c72 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -161,7 +161,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_VM))
 		return -EINVAL;
 
-	if (XE_IOCTL_DBG(xe, q->width != args->num_batch_buffer))
+	if (XE_IOCTL_DBG(xe, args->num_batch_buffer &&
+			 q->width != args->num_batch_buffer))
 		return -EINVAL;
 
 	if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_BANNED)) {
@@ -235,6 +236,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 		goto err_exec;
 	}
 
+	if (!args->num_batch_buffer) {
+		if (!xe_vm_in_lr_mode(vm)) {
+			struct dma_fence *fence =
+				xe_exec_queue_last_fence_get(q, vm);
+
+			for (i = 0; i < num_syncs; i++)
+				xe_sync_entry_signal(&syncs[i], NULL, fence);
+		}
+
+		goto err_exec;
+	}
+
 	if (xe_exec_queue_is_lr(q) && xe_exec_queue_ring_full(q)) {
 		err = -EWOULDBLOCK;
 		goto err_exec;
@@ -328,6 +341,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 	if (xe_exec_queue_is_lr(q))
 		q->ring_ops->emit_job(job);
+	if (!xe_vm_in_lr_mode(vm))
+		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
 	xe_sched_job_push(job);
 	xe_vm_reactivate_rebind(vm);
 
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index cb284c4ad049..67e3fd9dfc5f 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -886,7 +886,10 @@ int xe_exec_queue_destroy_ioctl(struct drm_device *dev, void *data,
 static void xe_exec_queue_last_fence_lockdep_assert(struct xe_exec_queue *q,
 						    struct xe_vm *vm)
 {
-	lockdep_assert_held_write(&vm->lock);
+	if (q->flags & EXEC_QUEUE_FLAG_VM)
+		lockdep_assert_held_write(&vm->lock);
+	else
+		xe_vm_assert_held(vm);
 }
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 5ba47a5cfdbd..52f0927d0d9b 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -56,8 +56,9 @@ struct xe_exec_queue {
 	struct xe_hw_fence_irq *fence_irq;
 
 	/**
-	 * @last_fence: last fence on engine, protected by vm->lock in write
-	 * mode if bind engine
+	 * @last_fence: last fence on exec queue, protected by vm->lock in write
+	 * mode if bind exec queue, protected by dma resv lock if non-bind exec
+	 * queue
 	 */
 	struct dma_fence *last_fence;
 
-- 
2.34.1



* [Intel-xe] [RFC PATCH 5/7] drm/xe: Take in-syncs into account when num_execs or num_binds == 0
  2023-12-07  5:57 [Intel-xe] [RFC PATCH 0/7] Syncs vs async exec/bind uAPI change Matthew Brost
                   ` (3 preceding siblings ...)
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 4/7] drm/xe: Allow num_batch_buffer == 0 in exec IOCTL Matthew Brost
@ 2023-12-07  5:57 ` Matthew Brost
  2023-12-08 15:04   ` Thomas Hellström
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 6/7] drm/xe: Add last fence as dependency for jobs on user exec queues Matthew Brost
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 22+ messages in thread
From: Matthew Brost @ 2023-12-07  5:57 UTC (permalink / raw)
  To: intel-xe

Wait on in-syncs before signaling out-syncs when num_execs == 0 in the
exec IOCTL or num_binds == 0 in the VM bind IOCTL, respectively.

v2: Wait on last fence in addition to in-fences (Thomas)
v3: Use function for in-fence signaling
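
The key piece is the composite fence: when in-syncs are present, the
fence handed to the out-syncs is a dma_fence_array over all in-fences
plus the queue's last fence, so it signals only once every dependency
has completed. The helper boils down to (simplified from the patch
below):

	/* signals when every member fence has signaled */
	cf = dma_fence_array_create(num_fences, fences,
				    vm->composite_fence_ctx,
				    vm->composite_fence_seqno++,
				    false);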

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_exec.c | 10 ++++-
 drivers/gpu/drm/xe/xe_sync.c | 74 ++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_sync.h |  5 +++
 drivers/gpu/drm/xe/xe_vm.c   | 41 ++++++++++++++++----
 4 files changed, 121 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 96d7506a4c72..438e34585e1e 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -238,11 +238,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 	if (!args->num_batch_buffer) {
 		if (!xe_vm_in_lr_mode(vm)) {
-			struct dma_fence *fence =
-				xe_exec_queue_last_fence_get(q, vm);
+			struct dma_fence *fence;
 
+			fence = xe_sync_in_fence_get(syncs, num_syncs, q, vm);
+			if (IS_ERR(fence)) {
+				err = PTR_ERR(fence);
+				goto err_exec;
+			}
 			for (i = 0; i < num_syncs; i++)
 				xe_sync_entry_signal(&syncs[i], NULL, fence);
+			xe_exec_queue_last_fence_set(q, vm, fence);
+			dma_fence_put(fence);
 		}
 
 		goto err_exec;
diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index d0f118223fa2..e4c220cf9115 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -5,6 +5,7 @@
 
 #include "xe_sync.h"
 
+#include <linux/dma-fence-array.h>
 #include <linux/kthread.h>
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
@@ -14,6 +15,7 @@
 #include <drm/xe_drm.h>
 
 #include "xe_device_types.h"
+#include "xe_exec_queue.h"
 #include "xe_macros.h"
 #include "xe_sched_job_types.h"
 
@@ -268,3 +270,75 @@ void xe_sync_entry_cleanup(struct xe_sync_entry *sync)
 	if (sync->ufence)
 		user_fence_put(sync->ufence);
 }
+
+/**
+ * xe_sync_in_fence_get() - Get a fence from syncs, exec queue, and VM
+ * @sync: input syncs
+ * @num_sync: number of syncs
+ * @q: exec queue
+ * @vm: VM
+ *
+ * Get a fence from syncs, exec queue, and VM. If syncs contain in-fences create
+ * and return a composite fence of all in-fences + last fence. If no in-fences
+ * return last fence on the input exec queue. Caller must drop reference to
+ * returned fence.
+ *
+ * Return: fence on success, ERR_PTR(-ENOMEM) on failure
+ */
+struct dma_fence *
+xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
+		     struct xe_exec_queue *q, struct xe_vm *vm)
+{
+	struct dma_fence **fences = NULL;
+	struct dma_fence_array *cf = NULL;
+	struct dma_fence *fence;
+	int i, num_in_fence = 0, current_fence = 0;
+
+	lockdep_assert_held(&vm->lock);
+
+	/* Count in-fences */
+	for (i = 0; i < num_sync; ++i) {
+		if (sync[i].fence) {
+			++num_in_fence;
+			fence = sync[i].fence;
+		}
+	}
+
+	/* Easy case... */
+	if (!num_in_fence) {
+		fence = xe_exec_queue_last_fence_get(q, vm);
+		dma_fence_get(fence);
+		return fence;
+	}
+
+	/* Create composite fence */
+	fences = kmalloc_array(num_in_fence + 1, sizeof(*fences), GFP_KERNEL);
+	if (!fences)
+		return ERR_PTR(-ENOMEM);
+	for (i = 0; i < num_sync; ++i) {
+		if (sync[i].fence) {
+			dma_fence_get(sync[i].fence);
+			fences[current_fence++] = sync[i].fence;
+		}
+	}
+	fences[current_fence++] = xe_exec_queue_last_fence_get(q, vm);
+	dma_fence_get(fences[current_fence - 1]);
+	cf = dma_fence_array_create(num_in_fence, fences,
+				    vm->composite_fence_ctx,
+				    vm->composite_fence_seqno++,
+				    false);
+	if (!cf) {
+		--vm->composite_fence_seqno;
+		goto err_out;
+	}
+
+	return &cf->base;
+
+err_out:
+	while (current_fence)
+		dma_fence_put(fences[--current_fence]);
+	kfree(fences);
+	kfree(cf);
+
+	return ERR_PTR(-ENOMEM);
+}
diff --git a/drivers/gpu/drm/xe/xe_sync.h b/drivers/gpu/drm/xe/xe_sync.h
index 45f4371e94b9..d284afbe917c 100644
--- a/drivers/gpu/drm/xe/xe_sync.h
+++ b/drivers/gpu/drm/xe/xe_sync.h
@@ -9,8 +9,10 @@
 #include "xe_sync_types.h"
 
 struct xe_device;
+struct xe_exec_queue;
 struct xe_file;
 struct xe_sched_job;
+struct xe_vm;
 
 #define SYNC_PARSE_FLAG_EXEC			BIT(0)
 #define SYNC_PARSE_FLAG_LR_MODE			BIT(1)
@@ -27,5 +29,8 @@ void xe_sync_entry_signal(struct xe_sync_entry *sync,
 			  struct xe_sched_job *job,
 			  struct dma_fence *fence);
 void xe_sync_entry_cleanup(struct xe_sync_entry *sync);
+struct dma_fence *
+xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
+		     struct xe_exec_queue *q, struct xe_vm *vm);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index f6de0584ea91..cf2eb44a71db 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -2948,6 +2948,37 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
 	return err;
 }
 
+static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
+				       struct xe_exec_queue *q,
+				       struct xe_sync_entry *syncs,
+				       int num_syncs)
+{
+	struct dma_fence *fence;
+	int i, err = 0;
+
+	fence = xe_sync_in_fence_get(syncs, num_syncs,
+				     to_wait_exec_queue(vm, q), vm);
+	if (IS_ERR(fence))
+		return PTR_ERR(fence);
+
+	for (i = 0; i < num_syncs; i++)
+		xe_sync_entry_signal(&syncs[i], NULL, fence);
+
+	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
+				     fence);
+
+	if (xe_vm_sync_mode(vm, q)) {
+		long timeout = dma_fence_wait(fence, true);
+
+		if (timeout < 0)
+			err = -EINTR;
+	}
+
+	dma_fence_put(fence);
+
+	return err;
+}
+
 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 {
 	struct xe_device *xe = to_xe_device(dev);
@@ -3178,12 +3209,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 unwind_ops:
 	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
 free_syncs:
-	for (i = 0; err == -ENODATA && i < num_syncs; i++) {
-		struct dma_fence *fence =
-			xe_exec_queue_last_fence_get(to_wait_exec_queue(vm, q), vm);
-
-		xe_sync_entry_signal(&syncs[i], NULL, fence);
-	}
+	if (err == -ENODATA)
+		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
 	while (num_syncs--)
 		xe_sync_entry_cleanup(&syncs[num_syncs]);
 
@@ -3203,7 +3230,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	kfree(ops);
 	if (args->num_binds > 1)
 		kfree(bind_ops);
-	return err == -ENODATA ? 0 : err;
+	return err;
 }
 
 /**
-- 
2.34.1



* [Intel-xe] [RFC PATCH 6/7] drm/xe: Add last fence as dependency for jobs on user exec queues
  2023-12-07  5:57 [Intel-xe] [RFC PATCH 0/7] Syncs vs async exec/bind uAPI change Matthew Brost
                   ` (4 preceding siblings ...)
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 5/7] drm/xe: Take in-syncs into account when num_execs or num_binds == 0 Matthew Brost
@ 2023-12-07  5:57 ` Matthew Brost
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling Matthew Brost
  2023-12-07  7:38 ` [Intel-xe] ✗ CI.Patch_applied: failure for Syncs vs async exec/bind uAPI change Patchwork
  7 siblings, 0 replies; 22+ messages in thread
From: Matthew Brost @ 2023-12-07  5:57 UTC (permalink / raw)
  To: intel-xe

The last fence must be added as a dependency for jobs on user exec
queues, as the last fence can be a composite software fence (unordered;
created by an IOCTL with zero batch buffers or binds) rather than a
hardware fence (ordered; the previous job on the queue).

Suggested-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_exec.c       |  4 ++++
 drivers/gpu/drm/xe/xe_exec_queue.c |  2 +-
 drivers/gpu/drm/xe/xe_migrate.c    | 14 +++++++++++---
 drivers/gpu/drm/xe/xe_sched_job.c  | 17 +++++++++++++++++
 drivers/gpu/drm/xe/xe_sched_job.h  |  4 ++++
 5 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 438e34585e1e..92b0da6580e8 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -313,6 +313,10 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 		goto err_put_job;
 
 	if (!xe_vm_in_lr_mode(vm)) {
+		err = xe_sched_job_last_fence_add_dep(job, vm);
+		if (err)
+			goto err_put_job;
+
 		err = down_read_interruptible(&vm->userptr.notifier_lock);
 		if (err)
 			goto err_put_job;
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index 67e3fd9dfc5f..3911d14522ee 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -887,7 +887,7 @@ static void xe_exec_queue_last_fence_lockdep_assert(struct xe_exec_queue *q,
 						    struct xe_vm *vm)
 {
 	if (q->flags & EXEC_QUEUE_FLAG_VM)
-		lockdep_assert_held_write(&vm->lock);
+		lockdep_assert_held(&vm->lock);
 	else
 		xe_vm_assert_held(vm);
 }
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index e8b567708ac0..ce14498b416a 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -1163,17 +1163,24 @@ xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
 	return fence;
 }
 
-static bool no_in_syncs(struct xe_sync_entry *syncs, u32 num_syncs)
+static bool no_in_syncs(struct xe_vm *vm, struct xe_exec_queue *q,
+			struct xe_sync_entry *syncs, u32 num_syncs)
 {
+	struct dma_fence *fence;
 	int i;
 
 	for (i = 0; i < num_syncs; i++) {
-		struct dma_fence *fence = syncs[i].fence;
+		fence = syncs[i].fence;
 
 		if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 				       &fence->flags))
 			return false;
 	}
+	if (q) {
+		fence = xe_exec_queue_last_fence_get(q, vm);
+		if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
+			return false;
+	}
 
 	return true;
 }
@@ -1234,7 +1241,7 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
 	u16 pat_index = xe->pat.idx[XE_CACHE_WB];
 
 	/* Use the CPU if no in syncs and engine is idle */
-	if (no_in_syncs(syncs, num_syncs) && xe_exec_queue_is_idle(q_override)) {
+	if (no_in_syncs(vm, q, syncs, num_syncs) && xe_exec_queue_is_idle(q_override)) {
 		fence =  xe_migrate_update_pgtables_cpu(m, vm, bo, updates,
 							num_updates,
 							first_munmap_rebind,
@@ -1351,6 +1358,7 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
 			goto err_job;
 	}
 
+	err = xe_sched_job_last_fence_add_dep(job, vm);
 	for (i = 0; !err && i < num_syncs; i++)
 		err = xe_sync_entry_add_deps(&syncs[i], job);
 
diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
index b467d5bfa4ac..b7d714522ae1 100644
--- a/drivers/gpu/drm/xe/xe_sched_job.c
+++ b/drivers/gpu/drm/xe/xe_sched_job.c
@@ -260,3 +260,20 @@ void xe_sched_job_push(struct xe_sched_job *job)
 	drm_sched_entity_push_job(&job->drm);
 	xe_sched_job_put(job);
 }
+
+int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm)
+{
+	struct dma_fence *fence;
+
+	fence = xe_exec_queue_last_fence_get(job->q, vm);
+
+	/* Only wait on unsignaled software fences */
+	if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) &&
+	    !(fence->context == job->drm.entity->fence_context ||
+	      fence->context == job->drm.entity->fence_context + 1)) {
+		dma_fence_get(fence);
+		return drm_sched_job_add_dependency(&job->drm, fence);
+	}
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h
index 6ca1d426c036..34f475ba7f50 100644
--- a/drivers/gpu/drm/xe/xe_sched_job.h
+++ b/drivers/gpu/drm/xe/xe_sched_job.h
@@ -8,6 +8,8 @@
 
 #include "xe_sched_job_types.h"
 
+struct xe_vm;
+
 #define XE_SCHED_HANG_LIMIT 1
 #define XE_SCHED_JOB_TIMEOUT LONG_MAX
 
@@ -54,6 +56,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job);
 void xe_sched_job_arm(struct xe_sched_job *job);
 void xe_sched_job_push(struct xe_sched_job *job);
 
+int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm);
+
 static inline struct xe_sched_job *
 to_xe_sched_job(struct drm_sched_job *drm)
 {
-- 
2.34.1



* [Intel-xe] [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-07  5:57 [Intel-xe] [RFC PATCH 0/7] Syncs vs async exec/bind uAPI change Matthew Brost
                   ` (5 preceding siblings ...)
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 6/7] drm/xe: Add last fence as dependency for jobs on user exec queues Matthew Brost
@ 2023-12-07  5:57 ` Matthew Brost
  2023-12-07 19:51   ` Rodrigo Vivi
  2023-12-08 15:00   ` Thomas Hellström
  2023-12-07  7:38 ` [Intel-xe] ✗ CI.Patch_applied: failure for Syncs vs async exec/bind uAPI change Patchwork
  7 siblings, 2 replies; 22+ messages in thread
From: Matthew Brost @ 2023-12-07  5:57 UTC (permalink / raw)
  To: intel-xe; +Cc: Francois Dugast, Rodrigo Vivi

Remove the concept of async vs sync VM bind queues; instead, make async
vs sync a per-IOCTL choice. Since this is per IOCTL, it makes sense to
have a single flag on the IOCTL rather than a per-VM-bind-op flag. Add
DRM_XE_SYNCS_FLAG_WAIT_FOR_OP, an input sync flag, to support this.
Support this new flag for both the VM bind IOCTL and the exec IOCTL so
the behavior matches.
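
A hypothetical userspace sketch of the new layout (values illustrative;
bind_op is a filled-in struct drm_xe_vm_bind_op):

	struct drm_xe_vm_bind bind = {
		.vm_id = vm_id,
		.num_binds = 1,
		.bind = bind_op,
		.syncs = {
			/* block in the IOCTL until the bind completes */
			.flags = DRM_XE_SYNCS_FLAG_WAIT_FOR_OP,
			.num_syncs = 0,	/* MBZ when waiting */
		},
	};

	ioctl(fd, DRM_IOCTL_XE_VM_BIND, &bind);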

Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++----
 drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
 drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
 drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-------------
 drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
 include/uapi/drm/xe_drm.h                |  56 +++++++-----
 6 files changed, 129 insertions(+), 119 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 92b0da6580e8..c62cabfaa112 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -130,12 +130,15 @@ static int xe_exec_begin(struct drm_exec *exec, struct xe_vm *vm)
 	return err;
 }
 
+#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
+
 int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 {
 	struct xe_device *xe = to_xe_device(dev);
 	struct xe_file *xef = to_xe_file(file);
 	struct drm_xe_exec *args = data;
-	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
+	struct drm_xe_sync __user *syncs_user =
+		u64_to_user_ptr(args->syncs.syncs);
 	u64 __user *addresses_user = u64_to_user_ptr(args->address);
 	struct xe_exec_queue *q;
 	struct xe_sync_entry *syncs = NULL;
@@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	struct drm_exec exec;
 	u32 i, num_syncs = 0;
 	struct xe_sched_job *job;
-	struct dma_fence *rebind_fence;
+	struct dma_fence *rebind_fence, *job_fence;
 	struct xe_vm *vm;
-	bool write_locked;
+	bool write_locked, skip_job_put = false;
+	bool wait = args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
 	ktime_t end = 0;
 	int err = 0;
 
 	if (XE_IOCTL_DBG(xe, args->extensions) ||
-	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
-	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
+	    XE_IOCTL_DBG(xe, args->pad || args->pad2[0] || args->pad2[1] || args->pad2[2]) ||
+	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
+	    XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
+	    XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
 		return -EINVAL;
 
 	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
@@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 		goto err_exec_queue;
 	}
 
-	if (args->num_syncs) {
-		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
+	if (args->syncs.num_syncs) {
+		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs),
+				GFP_KERNEL);
 		if (!syncs) {
 			err = -ENOMEM;
 			goto err_exec_queue;
@@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 	vm = q->vm;
 
-	for (i = 0; i < args->num_syncs; i++) {
+	for (i = 0; i < args->syncs.num_syncs; i++) {
 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
 					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
 					  (xe_vm_in_lr_mode(vm) ?
@@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 				err = PTR_ERR(fence);
 				goto err_exec;
 			}
+
 			for (i = 0; i < num_syncs; i++)
 				xe_sync_entry_signal(&syncs[i], NULL, fence);
+
 			xe_exec_queue_last_fence_set(q, vm, fence);
+			if (wait) {
+				long timeout = dma_fence_wait(fence, true);
+
+				if (timeout < 0)
+					err = -EINTR;
+			}
 			dma_fence_put(fence);
 		}
 
@@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	 * the job and let the DRM scheduler / backend clean up the job.
 	 */
 	xe_sched_job_arm(job);
+	job_fence = &job->drm.s_fence->finished;
+	if (wait)
+		dma_fence_get(job_fence);
 	if (!xe_vm_in_lr_mode(vm)) {
 		/* Block userptr invalidations / BO eviction */
-		dma_resv_add_fence(&vm->resv,
-				   &job->drm.s_fence->finished,
+		dma_resv_add_fence(&vm->resv, job_fence,
 				   DMA_RESV_USAGE_BOOKKEEP);
 
 		/*
 		 * Make implicit sync work across drivers, assuming all external
 		 * BOs are written as we don't pass in a read / write list.
 		 */
-		xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
-					DMA_RESV_USAGE_WRITE);
+		xe_vm_fence_all_extobjs(vm, job_fence, DMA_RESV_USAGE_WRITE);
 	}
 
 	for (i = 0; i < num_syncs; i++)
-		xe_sync_entry_signal(&syncs[i], job,
-				     &job->drm.s_fence->finished);
+		xe_sync_entry_signal(&syncs[i], job, job_fence);
 
 	if (xe_exec_queue_is_lr(q))
 		q->ring_ops->emit_job(job);
 	if (!xe_vm_in_lr_mode(vm))
-		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
+		xe_exec_queue_last_fence_set(q, vm, job_fence);
 	xe_sched_job_push(job);
 	xe_vm_reactivate_rebind(vm);
 
-	if (!err && !xe_vm_in_lr_mode(vm)) {
+	if (!xe_vm_in_lr_mode(vm)) {
 		spin_lock(&xe->ttm.lru_lock);
 		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
 		spin_unlock(&xe->ttm.lru_lock);
 	}
 
+	skip_job_put = true;
+	if (wait) {
+		long timeout = dma_fence_wait(job_fence, true);
+
+		dma_fence_put(job_fence);
+		if (timeout < 0)
+			err = -EINTR;
+	}
+
 err_repin:
 	if (!xe_vm_in_lr_mode(vm))
 		up_read(&vm->userptr.notifier_lock);
 err_put_job:
-	if (err)
+	if (err && !skip_job_put)
 		xe_sched_job_put(job);
 err_exec:
 	drm_exec_fini(&exec);
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index 3911d14522ee..98776d02d634 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
 	if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
 		return -EINVAL;
 
-	if (eci[0].engine_class >= DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
-		bool sync = eci[0].engine_class ==
-			DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
-
+	if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
 		for_each_gt(gt, xe, id) {
 			struct xe_exec_queue *new;
 
@@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
 						   args->width, hwe,
 						   EXEC_QUEUE_FLAG_PERSISTENT |
 						   EXEC_QUEUE_FLAG_VM |
-						   (sync ? 0 :
-						    EXEC_QUEUE_FLAG_VM_ASYNC) |
 						   (id ?
 						    EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
 						    0));
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 52f0927d0d9b..c78f6e8b41c4 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -74,8 +74,6 @@ struct xe_exec_queue {
 #define EXEC_QUEUE_FLAG_VM			BIT(4)
 /* child of VM queue for multi-tile VM jobs */
 #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD	BIT(5)
-/* VM jobs for this queue are asynchronous */
-#define EXEC_QUEUE_FLAG_VM_ASYNC		BIT(6)
 
 	/**
 	 * @flags: flags for this exec queue, should statically setup aside from ban
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index cf2eb44a71db..4b0c976c003a 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
 			struct xe_gt *gt = tile->primary_gt;
 			struct xe_vm *migrate_vm;
 			struct xe_exec_queue *q;
-			u32 create_flags = EXEC_QUEUE_FLAG_VM |
-				((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
-				EXEC_QUEUE_FLAG_VM_ASYNC : 0);
+			u32 create_flags = EXEC_QUEUE_FLAG_VM;
 
 			if (!vm->pt_root[id])
 				continue;
@@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
 	return ERR_PTR(err);
 }
 
-static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
-{
-	return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
-		!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
-}
-
 static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
 			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
 			u32 num_syncs, bool immediate, bool first_op,
-			bool last_op)
+			bool last_op, bool async)
 {
 	struct dma_fence *fence;
 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
@@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
 
 	if (last_op)
 		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
-	if (last_op && xe_vm_sync_mode(vm, q))
+	if (last_op && !async)
 		dma_fence_wait(fence, true);
 	dma_fence_put(fence);
 
@@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
 static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
 		      struct xe_bo *bo, struct xe_sync_entry *syncs,
 		      u32 num_syncs, bool immediate, bool first_op,
-		      bool last_op)
+		      bool last_op, bool async)
 {
 	int err;
 
@@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue
 	}
 
 	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
-			    last_op);
+			    last_op, async);
 }
 
 static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
 			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
-			u32 num_syncs, bool first_op, bool last_op)
+			u32 num_syncs, bool first_op, bool last_op, bool async)
 {
 	struct dma_fence *fence;
 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
@@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
 	xe_vma_destroy(vma, fence);
 	if (last_op)
 		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
-	if (last_op && xe_vm_sync_mode(vm, q))
+	if (last_op && !async)
 		dma_fence_wait(fence, true);
 	dma_fence_put(fence);
 
@@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
 
 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
 				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
-				    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
 				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
 
 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
@@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
 		flags |= XE_VM_FLAG_SCRATCH_PAGE;
 	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
 		flags |= XE_VM_FLAG_LR_MODE;
-	if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
-		flags |= XE_VM_FLAG_ASYNC_DEFAULT;
 	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
 		flags |= XE_VM_FLAG_FAULT_MODE;
 
@@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] = {
 static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
 			  struct xe_exec_queue *q, u32 region,
 			  struct xe_sync_entry *syncs, u32 num_syncs,
-			  bool first_op, bool last_op)
+			  bool first_op, bool last_op, bool async)
 {
 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
 	int err;
@@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
 
 	if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
 		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
-				  true, first_op, last_op);
+				  true, first_op, last_op, async);
 	} else {
 		int i;
 
@@ -2400,6 +2389,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
 		}
 
 		op->q = q;
+		if (async)
+			op->flags |= XE_VMA_OP_ASYNC;
 
 		switch (op->base.op) {
 		case DRM_GPUVA_OP_MAP:
@@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
 				 op->syncs, op->num_syncs,
 				 op->map.immediate || !xe_vm_in_fault_mode(vm),
 				 op->flags & XE_VMA_OP_FIRST,
-				 op->flags & XE_VMA_OP_LAST);
+				 op->flags & XE_VMA_OP_LAST,
+				 op->flags & XE_VMA_OP_ASYNC);
 		break;
 	case DRM_GPUVA_OP_REMAP:
 	{
@@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
 					   op->num_syncs,
 					   op->flags & XE_VMA_OP_FIRST,
 					   op->flags & XE_VMA_OP_LAST &&
-					   !prev && !next);
+					   !prev && !next,
+					   op->flags & XE_VMA_OP_ASYNC);
 			if (err)
 				break;
 			op->remap.unmap_done = true;
@@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
 			err = xe_vm_bind(vm, op->remap.prev, op->q,
 					 xe_vma_bo(op->remap.prev), op->syncs,
 					 op->num_syncs, true, false,
-					 op->flags & XE_VMA_OP_LAST && !next);
+					 op->flags & XE_VMA_OP_LAST && !next,
+					 op->flags & XE_VMA_OP_ASYNC);
 			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
 			if (err)
 				break;
@@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
 					 xe_vma_bo(op->remap.next),
 					 op->syncs, op->num_syncs,
 					 true, false,
-					 op->flags & XE_VMA_OP_LAST);
+					 op->flags & XE_VMA_OP_LAST,
+					 op->flags & XE_VMA_OP_ASYNC);
 			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
 			if (err)
 				break;
@@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
 	case DRM_GPUVA_OP_UNMAP:
 		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
 				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
-				   op->flags & XE_VMA_OP_LAST);
+				   op->flags & XE_VMA_OP_LAST,
+				   op->flags & XE_VMA_OP_ASYNC);
 		break;
 	case DRM_GPUVA_OP_PREFETCH:
 		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
 				     op->syncs, op->num_syncs,
 				     op->flags & XE_VMA_OP_FIRST,
-				     op->flags & XE_VMA_OP_LAST);
+				     op->flags & XE_VMA_OP_LAST,
+				     op->flags & XE_VMA_OP_ASYNC);
 		break;
 	default:
 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
@@ -2808,16 +2805,16 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
 
 #ifdef TEST_VM_ASYNC_OPS_ERROR
 #define SUPPORTED_FLAGS	\
-	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
-	 DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
-	 DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
+	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
+	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
 #else
 #define SUPPORTED_FLAGS	\
-	(DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
+	(DRM_XE_VM_BIND_FLAG_READONLY | \
 	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
 	 0xffff)
 #endif
 #define XE_64K_PAGE_MASK 0xffffull
+#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
 
 #define MAX_BINDS	512	/* FIXME: Picking random upper limit */
 
@@ -2829,7 +2826,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
 	int err;
 	int i;
 
-	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
+	if (XE_IOCTL_DBG(xe, args->pad) ||
 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
 		return -EINVAL;
 
@@ -2857,6 +2854,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
 		*bind_ops = &args->bind;
 	}
 
+	*async = !(args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
+
+	if (XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
+	    XE_IOCTL_DBG(xe, !*async && args->syncs.num_syncs)) {
+		err = -EINVAL;
+		goto free_bind_ops;
+	}
+
 	for (i = 0; i < args->num_binds; ++i) {
 		u64 range = (*bind_ops)[i].range;
 		u64 addr = (*bind_ops)[i].addr;
@@ -2887,18 +2892,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
 			goto free_bind_ops;
 		}
 
-		if (i == 0) {
-			*async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
-			if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
-				err = -EINVAL;
-				goto free_bind_ops;
-			}
-		} else if (XE_IOCTL_DBG(xe, *async !=
-					!!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
-			err = -EINVAL;
-			goto free_bind_ops;
-		}
-
 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
 		    XE_IOCTL_DBG(xe, obj && is_null) ||
@@ -2951,7 +2944,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
 				       struct xe_exec_queue *q,
 				       struct xe_sync_entry *syncs,
-				       int num_syncs)
+				       int num_syncs, bool async)
 {
 	struct dma_fence *fence;
 	int i, err = 0;
@@ -2967,7 +2960,7 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
 	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
 				     fence);
 
-	if (xe_vm_sync_mode(vm, q)) {
+	if (!async) {
 		long timeout = dma_fence_wait(fence, true);
 
 		if (timeout < 0)
@@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	if (err)
 		return err;
 
-	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
+	if (XE_IOCTL_DBG(xe, args->pad) ||
 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
 		return -EINVAL;
 
@@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 			err = -EINVAL;
 			goto put_exec_queue;
 		}
-
-		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
-				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
-			err = -EINVAL;
-			goto put_exec_queue;
-		}
 	}
 
 	vm = xe_vm_lookup(xef, args->vm_id);
@@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 		goto put_exec_queue;
 	}
 
-	if (!args->exec_queue_id) {
-		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
-				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
-			err = -EINVAL;
-			goto put_vm;
-		}
-	}
-
 	err = down_write_killable(&vm->lock);
 	if (err)
 		goto put_vm;
@@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 		}
 	}
 
-	if (args->num_syncs) {
-		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
+	if (args->syncs.num_syncs) {
+		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs), GFP_KERNEL);
 		if (!syncs) {
 			err = -ENOMEM;
 			goto put_obj;
 		}
 	}
 
-	syncs_user = u64_to_user_ptr(args->syncs);
-	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
+	syncs_user = u64_to_user_ptr(args->syncs.syncs);
+	for (num_syncs = 0; num_syncs < args->syncs.num_syncs; num_syncs++) {
 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
 					  &syncs_user[num_syncs],
 					  (xe_vm_in_lr_mode(vm) ?
@@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
 free_syncs:
 	if (err == -ENODATA)
-		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
+		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs,
+						  async);
 	while (num_syncs--)
 		xe_sync_entry_cleanup(&syncs[num_syncs]);
 
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 23abdfd8622f..ce8b9bde7e9c 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -167,13 +167,12 @@ struct xe_vm {
 	 */
 #define XE_VM_FLAG_64K			BIT(0)
 #define XE_VM_FLAG_LR_MODE		BIT(1)
-#define XE_VM_FLAG_ASYNC_DEFAULT	BIT(2)
-#define XE_VM_FLAG_MIGRATION		BIT(3)
-#define XE_VM_FLAG_SCRATCH_PAGE		BIT(4)
-#define XE_VM_FLAG_FAULT_MODE		BIT(5)
-#define XE_VM_FLAG_BANNED		BIT(6)
-#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(8, 7), flags)
-#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(8, 7), (tile)->id)
+#define XE_VM_FLAG_MIGRATION		BIT(2)
+#define XE_VM_FLAG_SCRATCH_PAGE		BIT(3)
+#define XE_VM_FLAG_FAULT_MODE		BIT(4)
+#define XE_VM_FLAG_BANNED		BIT(5)
+#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(7, 6), flags)
+#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(7, 6), (tile)->id)
 	unsigned long flags;
 
 	/** @composite_fence_ctx: context composite fence */
@@ -385,6 +384,8 @@ enum xe_vma_op_flags {
 	XE_VMA_OP_PREV_COMMITTED	= BIT(3),
 	/** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */
 	XE_VMA_OP_NEXT_COMMITTED	= BIT(4),
+	/** @XE_VMA_OP_ASYNC: operation is async */
+	XE_VMA_OP_ASYNC			= BIT(5),
 };
 
 /** struct xe_vma_op - VMA operation */
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index eb03a49c17a1..fd8172fe2d9a 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
 	 * Kernel only classes (not actual hardware engine class). Used for
 	 * creating ordered queues of VM bind operations.
 	 */
-#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC	5
-#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC	6
+#define DRM_XE_ENGINE_CLASS_VM_BIND		5
 	__u16 engine_class;
 
 	__u16 engine_instance;
@@ -660,7 +659,6 @@ struct drm_xe_vm_create {
 	 * still enable recoverable pagefaults if supported by the device.
 	 */
 #define DRM_XE_VM_CREATE_FLAG_LR_MODE	        (1 << 1)
-#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT	(1 << 2)
 	/*
 	 * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
 	 * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to be allocated
@@ -668,7 +666,7 @@ struct drm_xe_vm_create {
 	 * The xe driver internally uses recoverable pagefaults to implement
 	 * this.
 	 */
-#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 3)
+#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 2)
 	/** @flags: Flags */
 	__u32 flags;
 
@@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
 	__u32 op;
 
 #define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
-#define DRM_XE_VM_BIND_FLAG_ASYNC	(1 << 1)
 	/*
 	 * Valid on a faulting VM only, do the MAP operation immediately rather
 	 * than deferring the MAP to the page fault handler.
 	 */
-#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 2)
+#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
 	/*
 	 * When the NULL flag is set, the page tables are setup with a special
 	 * bit which indicates writes are dropped and all reads return zero.  In
@@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
 	 * operations, the BO handle MBZ, and the BO offset MBZ. This flag is
 	 * intended to implement VK sparse bindings.
 	 */
-#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 3)
+#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
 	/** @flags: Bind flags */
 	__u32 flags;
 
@@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
 	__u64 reserved[3];
 };
 
+/**
+ * struct drm_xe_syncs - In / out syncs for IOCTLs.
+ */
+struct drm_xe_syncs {
+	/** @num_syncs: amount of syncs to wait on */
+	__u32 num_syncs;
+
+	/*
+	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
+	 */
+#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
+	/** @flags: Sync flags */
+	__u32 flags;
+
+	/** @syncs: pointer to struct drm_xe_sync array */
+	__u64 syncs;
+
+	/** @reserved: Reserved */
+	__u64 reserved[2];
+};
+
 struct drm_xe_vm_bind {
 	/** @extensions: Pointer to the first extension struct, if any */
 	__u64 extensions;
@@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
 		__u64 vector_of_binds;
 	};
 
-	/** @pad: MBZ */
-	__u32 pad2;
-
-	/** @num_syncs: amount of syncs to wait on */
-	__u32 num_syncs;
-
-	/** @syncs: pointer to struct drm_xe_sync array */
-	__u64 syncs;
+	/** @syncs: syncs for bind */
+	struct drm_xe_syncs syncs;
 
 	/** @reserved: Reserved */
 	__u64 reserved[2];
@@ -974,14 +986,14 @@ struct drm_xe_exec {
 	/** @extensions: Pointer to the first extension struct, if any */
 	__u64 extensions;
 
+	/** @pad: MBZ */
+	__u32 pad;
+
 	/** @exec_queue_id: Exec queue ID for the batch buffer */
 	__u32 exec_queue_id;
 
-	/** @num_syncs: Amount of struct drm_xe_sync in array. */
-	__u32 num_syncs;
-
-	/** @syncs: Pointer to struct drm_xe_sync array. */
-	__u64 syncs;
+	/** @syncs: syncs for exec */
+	struct drm_xe_syncs syncs;
 
 	/**
 	 * @address: address of batch buffer if num_batch_buffer == 1 or an
@@ -995,8 +1007,8 @@ struct drm_xe_exec {
 	 */
 	__u16 num_batch_buffer;
 
-	/** @pad: MBZ */
-	__u16 pad[3];
+	/** @pad2: MBZ */
+	__u16 pad2[3];
 
 	/** @reserved: Reserved */
 	__u64 reserved[2];
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Intel-xe] ✗ CI.Patch_applied: failure for Syncs vs async exec/bind uAPI change
  2023-12-07  5:57 [Intel-xe] [RFC PATCH 0/7] Syncs vs async exec/bind uAPI change Matthew Brost
                   ` (6 preceding siblings ...)
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling Matthew Brost
@ 2023-12-07  7:38 ` Patchwork
  7 siblings, 0 replies; 22+ messages in thread
From: Patchwork @ 2023-12-07  7:38 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe

== Series Details ==

Series: Syncs vs async exec/bind uAPI change
URL   : https://patchwork.freedesktop.org/series/127472/
State : failure

== Summary ==

=== Applying kernel patches on branch 'drm-xe-next' with base: ===
Base commit: 668d13abe drm/xe: Avoid any races around ccs_mode update
=== git am output follows ===
error: patch failed: include/uapi/drm/xe_drm.h:141
error: include/uapi/drm/xe_drm.h: patch does not apply
hint: Use 'git am --show-current-patch' to see the failed patch
Applying: drm/xe: Use a flags field instead of bools for VMA create
Applying: drm/xe: Use a flags field instead of bools for sync parse
Applying: drm/xe: Allow num_binds == 0 in VM bind IOCTL
Applying: drm/xe: Allow num_batch_buffer == 0 in exec IOCTL
Applying: drm/xe: Take in-syncs into account when num_execs or num_binds == 0
Applying: drm/xe: Add last fence as dependency for jobs on user exec queues
Applying: drm/xe/uapi: Uniform async vs sync handling
Patch failed at 0007 drm/xe/uapi: Uniform async vs sync handling
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-xe] [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling Matthew Brost
@ 2023-12-07 19:51   ` Rodrigo Vivi
  2023-12-08 15:00   ` Thomas Hellström
  1 sibling, 0 replies; 22+ messages in thread
From: Rodrigo Vivi @ 2023-12-07 19:51 UTC (permalink / raw)
  To: Matthew Brost; +Cc: Francois Dugast, intel-xe

On Wed, Dec 06, 2023 at 09:57:29PM -0800, Matthew Brost wrote:
> Remove the concept of async vs sync VM bind queues, rather make async vs
> sync a per-IOCTL choice. Since this is per IOCTL, it makes sense to have
> a singular IOCTL flag rather than a per VM bind op flag too. Add
> DRM_XE_SYNCS_FLAG_WAIT_FOR_OP, which is an input sync flag to support
> this. Support this new flag for both the VM bind IOCTL and the exec
> IOCTL to match behavior.
> 
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>

I'd like to get an extra ack from Thomas to ensure this aligns with
all he documented for the async case.

But the code looks good to me and also pahole is happy with
your padding choices, so:

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
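
For reference, struct layout / padding can be double-checked with pahole,
e.g. (illustrative invocation, run against a vmlinux or xe.ko built with
debug info; the struct name here is just an example):

  $ pahole -C drm_xe_exec vmlinux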

> Cc: Francois Dugast <francois.dugast@intel.com>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++----
>  drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
>  drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
>  drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-------------
>  drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
>  include/uapi/drm/xe_drm.h                |  56 +++++++-----
>  6 files changed, 129 insertions(+), 119 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> index 92b0da6580e8..c62cabfaa112 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -130,12 +130,15 @@ static int xe_exec_begin(struct drm_exec *exec, struct xe_vm *vm)
>  	return err;
>  }
>  
> +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> +
>  int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  {
>  	struct xe_device *xe = to_xe_device(dev);
>  	struct xe_file *xef = to_xe_file(file);
>  	struct drm_xe_exec *args = data;
> -	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
> +	struct drm_xe_sync __user *syncs_user =
> +		u64_to_user_ptr(args->syncs.syncs);
>  	u64 __user *addresses_user = u64_to_user_ptr(args->address);
>  	struct xe_exec_queue *q;
>  	struct xe_sync_entry *syncs = NULL;
> @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  	struct drm_exec exec;
>  	u32 i, num_syncs = 0;
>  	struct xe_sched_job *job;
> -	struct dma_fence *rebind_fence;
> +	struct dma_fence *rebind_fence, *job_fence;
>  	struct xe_vm *vm;
> -	bool write_locked;
> +	bool write_locked, skip_job_put = false;
> +	bool wait = args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
>  	ktime_t end = 0;
>  	int err = 0;
>  
>  	if (XE_IOCTL_DBG(xe, args->extensions) ||
> -	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
> -	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> +	    XE_IOCTL_DBG(xe, args->pad || args->pad2[0] || args->pad2[1] || args->pad2[2]) ||
> +	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
> +	    XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> +	    XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
>  		return -EINVAL;
>  
>  	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  		goto err_exec_queue;
>  	}
>  
> -	if (args->num_syncs) {
> -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> +	if (args->syncs.num_syncs) {
> +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs),
> +				GFP_KERNEL);
>  		if (!syncs) {
>  			err = -ENOMEM;
>  			goto err_exec_queue;
> @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  
>  	vm = q->vm;
>  
> -	for (i = 0; i < args->num_syncs; i++) {
> +	for (i = 0; i < args->syncs.num_syncs; i++) {
>  		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
>  					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
>  					  (xe_vm_in_lr_mode(vm) ?
> @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  				err = PTR_ERR(fence);
>  				goto err_exec;
>  			}
> +
>  			for (i = 0; i < num_syncs; i++)
>  				xe_sync_entry_signal(&syncs[i], NULL, fence);
> +
>  			xe_exec_queue_last_fence_set(q, vm, fence);
> +			if (wait) {
> +				long timeout = dma_fence_wait(fence, true);
> +
> +				if (timeout < 0)
> +					err = -EINTR;
> +			}
>  			dma_fence_put(fence);
>  		}
>  
> @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  	 * the job and let the DRM scheduler / backend clean up the job.
>  	 */
>  	xe_sched_job_arm(job);
> +	job_fence = &job->drm.s_fence->finished;
> +	if (wait)
> +		dma_fence_get(job_fence);
>  	if (!xe_vm_in_lr_mode(vm)) {
>  		/* Block userptr invalidations / BO eviction */
> -		dma_resv_add_fence(&vm->resv,
> -				   &job->drm.s_fence->finished,
> +		dma_resv_add_fence(&vm->resv, job_fence,
>  				   DMA_RESV_USAGE_BOOKKEEP);
>  
>  		/*
>  		 * Make implicit sync work across drivers, assuming all external
>  		 * BOs are written as we don't pass in a read / write list.
>  		 */
> -		xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
> -					DMA_RESV_USAGE_WRITE);
> +		xe_vm_fence_all_extobjs(vm, job_fence, DMA_RESV_USAGE_WRITE);
>  	}
>  
>  	for (i = 0; i < num_syncs; i++)
> -		xe_sync_entry_signal(&syncs[i], job,
> -				     &job->drm.s_fence->finished);
> +		xe_sync_entry_signal(&syncs[i], job, job_fence);
>  
>  	if (xe_exec_queue_is_lr(q))
>  		q->ring_ops->emit_job(job);
>  	if (!xe_vm_in_lr_mode(vm))
> -		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
> +		xe_exec_queue_last_fence_set(q, vm, job_fence);
>  	xe_sched_job_push(job);
>  	xe_vm_reactivate_rebind(vm);
>  
> -	if (!err && !xe_vm_in_lr_mode(vm)) {
> +	if (!xe_vm_in_lr_mode(vm)) {
>  		spin_lock(&xe->ttm.lru_lock);
>  		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
>  		spin_unlock(&xe->ttm.lru_lock);
>  	}
>  
> +	skip_job_put = true;
> +	if (wait) {
> +		long timeout = dma_fence_wait(job_fence, true);
> +
> +		dma_fence_put(job_fence);
> +		if (timeout < 0)
> +			err = -EINTR;
> +	}
> +
>  err_repin:
>  	if (!xe_vm_in_lr_mode(vm))
>  		up_read(&vm->userptr.notifier_lock);
>  err_put_job:
> -	if (err)
> +	if (err && !skip_job_put)
>  		xe_sched_job_put(job);
>  err_exec:
>  	drm_exec_fini(&exec);
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 3911d14522ee..98776d02d634 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>  	if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
>  		return -EINVAL;
>  
> -	if (eci[0].engine_class >= DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
> -		bool sync = eci[0].engine_class ==
> -			DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
> -
> +	if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
>  		for_each_gt(gt, xe, id) {
>  			struct xe_exec_queue *new;
>  
> @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>  						   args->width, hwe,
>  						   EXEC_QUEUE_FLAG_PERSISTENT |
>  						   EXEC_QUEUE_FLAG_VM |
> -						   (sync ? 0 :
> -						    EXEC_QUEUE_FLAG_VM_ASYNC) |
>  						   (id ?
>  						    EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
>  						    0));
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> index 52f0927d0d9b..c78f6e8b41c4 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> @@ -74,8 +74,6 @@ struct xe_exec_queue {
>  #define EXEC_QUEUE_FLAG_VM			BIT(4)
>  /* child of VM queue for multi-tile VM jobs */
>  #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD	BIT(5)
> -/* VM jobs for this queue are asynchronous */
> -#define EXEC_QUEUE_FLAG_VM_ASYNC		BIT(6)
>  
>  	/**
>  	 * @flags: flags for this exec queue, should statically setup aside from ban
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index cf2eb44a71db..4b0c976c003a 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
>  			struct xe_gt *gt = tile->primary_gt;
>  			struct xe_vm *migrate_vm;
>  			struct xe_exec_queue *q;
> -			u32 create_flags = EXEC_QUEUE_FLAG_VM |
> -				((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
> -				EXEC_QUEUE_FLAG_VM_ASYNC : 0);
> +			u32 create_flags = EXEC_QUEUE_FLAG_VM;
>  
>  			if (!vm->pt_root[id])
>  				continue;
> @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
>  	return ERR_PTR(err);
>  }
>  
> -static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
> -{
> -	return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
> -		!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
> -}
> -
>  static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>  			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
>  			u32 num_syncs, bool immediate, bool first_op,
> -			bool last_op)
> +			bool last_op, bool async)
>  {
>  	struct dma_fence *fence;
>  	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>  
>  	if (last_op)
>  		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> -	if (last_op && xe_vm_sync_mode(vm, q))
> +	if (last_op && !async)
>  		dma_fence_wait(fence, true);
>  	dma_fence_put(fence);
>  
> @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>  static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
>  		      struct xe_bo *bo, struct xe_sync_entry *syncs,
>  		      u32 num_syncs, bool immediate, bool first_op,
> -		      bool last_op)
> +		      bool last_op, bool async)
>  {
>  	int err;
>  
> @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue
>  	}
>  
>  	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
> -			    last_op);
> +			    last_op, async);
>  }
>  
>  static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>  			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
> -			u32 num_syncs, bool first_op, bool last_op)
> +			u32 num_syncs, bool first_op, bool last_op, bool async)
>  {
>  	struct dma_fence *fence;
>  	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>  	xe_vma_destroy(vma, fence);
>  	if (last_op)
>  		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> -	if (last_op && xe_vm_sync_mode(vm, q))
> +	if (last_op && !async)
>  		dma_fence_wait(fence, true);
>  	dma_fence_put(fence);
>  
> @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>  
>  #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
>  				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
> -				    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
>  				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
>  
>  int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
>  		flags |= XE_VM_FLAG_SCRATCH_PAGE;
>  	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
>  		flags |= XE_VM_FLAG_LR_MODE;
> -	if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
> -		flags |= XE_VM_FLAG_ASYNC_DEFAULT;
>  	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
>  		flags |= XE_VM_FLAG_FAULT_MODE;
>  
> @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] = {
>  static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
>  			  struct xe_exec_queue *q, u32 region,
>  			  struct xe_sync_entry *syncs, u32 num_syncs,
> -			  bool first_op, bool last_op)
> +			  bool first_op, bool last_op, bool async)
>  {
>  	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>  	int err;
> @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
>  
>  	if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
>  		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
> -				  true, first_op, last_op);
> +				  true, first_op, last_op, async);
>  	} else {
>  		int i;
>  
> @@ -2400,6 +2389,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
>  		}
>  
>  		op->q = q;
> +		if (async)
> +			op->flags |= XE_VMA_OP_ASYNC;
>  
>  		switch (op->base.op) {
>  		case DRM_GPUVA_OP_MAP:
> @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>  				 op->syncs, op->num_syncs,
>  				 op->map.immediate || !xe_vm_in_fault_mode(vm),
>  				 op->flags & XE_VMA_OP_FIRST,
> -				 op->flags & XE_VMA_OP_LAST);
> +				 op->flags & XE_VMA_OP_LAST,
> +				 op->flags & XE_VMA_OP_ASYNC);
>  		break;
>  	case DRM_GPUVA_OP_REMAP:
>  	{
> @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>  					   op->num_syncs,
>  					   op->flags & XE_VMA_OP_FIRST,
>  					   op->flags & XE_VMA_OP_LAST &&
> -					   !prev && !next);
> +					   !prev && !next,
> +					   op->flags & XE_VMA_OP_ASYNC);
>  			if (err)
>  				break;
>  			op->remap.unmap_done = true;
> @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>  			err = xe_vm_bind(vm, op->remap.prev, op->q,
>  					 xe_vma_bo(op->remap.prev), op->syncs,
>  					 op->num_syncs, true, false,
> -					 op->flags & XE_VMA_OP_LAST && !next);
> +					 op->flags & XE_VMA_OP_LAST && !next,
> +					 op->flags & XE_VMA_OP_ASYNC);
>  			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
>  			if (err)
>  				break;
> @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>  					 xe_vma_bo(op->remap.next),
>  					 op->syncs, op->num_syncs,
>  					 true, false,
> -					 op->flags & XE_VMA_OP_LAST);
> +					 op->flags & XE_VMA_OP_LAST,
> +					 op->flags & XE_VMA_OP_ASYNC);
>  			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
>  			if (err)
>  				break;
> @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>  	case DRM_GPUVA_OP_UNMAP:
>  		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
>  				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
> -				   op->flags & XE_VMA_OP_LAST);
> +				   op->flags & XE_VMA_OP_LAST,
> +				   op->flags & XE_VMA_OP_ASYNC);
>  		break;
>  	case DRM_GPUVA_OP_PREFETCH:
>  		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
>  				     op->syncs, op->num_syncs,
>  				     op->flags & XE_VMA_OP_FIRST,
> -				     op->flags & XE_VMA_OP_LAST);
> +				     op->flags & XE_VMA_OP_LAST,
> +				     op->flags & XE_VMA_OP_ASYNC);
>  		break;
>  	default:
>  		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
> @@ -2808,16 +2805,16 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
>  
>  #ifdef TEST_VM_ASYNC_OPS_ERROR
>  #define SUPPORTED_FLAGS	\
> -	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
> -	 DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
> -	 DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> +	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
> +	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
>  #else
>  #define SUPPORTED_FLAGS	\
> -	(DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
> +	(DRM_XE_VM_BIND_FLAG_READONLY | \
>  	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
>  	 0xffff)
>  #endif
>  #define XE_64K_PAGE_MASK 0xffffull
> +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
>  
>  #define MAX_BINDS	512	/* FIXME: Picking random upper limit */
>  
> @@ -2829,7 +2826,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>  	int err;
>  	int i;
>  
> -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> +	if (XE_IOCTL_DBG(xe, args->pad) ||
>  	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>  		return -EINVAL;
>  
> @@ -2857,6 +2854,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>  		*bind_ops = &args->bind;
>  	}
>  
> +	*async = !(args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
> +
> +	if (XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> +	    XE_IOCTL_DBG(xe, !*async && args->syncs.num_syncs)) {
> +		err = -EINVAL;
> +		goto free_bind_ops;
> +	}
> +
>  	for (i = 0; i < args->num_binds; ++i) {
>  		u64 range = (*bind_ops)[i].range;
>  		u64 addr = (*bind_ops)[i].addr;
> @@ -2887,18 +2892,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>  			goto free_bind_ops;
>  		}
>  
> -		if (i == 0) {
> -			*async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
> -			if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
> -				err = -EINVAL;
> -				goto free_bind_ops;
> -			}
> -		} else if (XE_IOCTL_DBG(xe, *async !=
> -					!!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
> -			err = -EINVAL;
> -			goto free_bind_ops;
> -		}
> -
>  		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
>  		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
>  		    XE_IOCTL_DBG(xe, obj && is_null) ||
> @@ -2951,7 +2944,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>  static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
>  				       struct xe_exec_queue *q,
>  				       struct xe_sync_entry *syncs,
> -				       int num_syncs)
> +				       int num_syncs, bool async)
>  {
>  	struct dma_fence *fence;
>  	int i, err = 0;
> @@ -2967,7 +2960,7 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
>  	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
>  				     fence);
>  
> -	if (xe_vm_sync_mode(vm, q)) {
> +	if (!async) {
>  		long timeout = dma_fence_wait(fence, true);
>  
>  		if (timeout < 0)
> @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  	if (err)
>  		return err;
>  
> -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> +	if (XE_IOCTL_DBG(xe, args->pad) ||
>  	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>  		return -EINVAL;
>  
> @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  			err = -EINVAL;
>  			goto put_exec_queue;
>  		}
> -
> -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> -				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
> -			err = -EINVAL;
> -			goto put_exec_queue;
> -		}
>  	}
>  
>  	vm = xe_vm_lookup(xef, args->vm_id);
> @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  		goto put_exec_queue;
>  	}
>  
> -	if (!args->exec_queue_id) {
> -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> -				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
> -			err = -EINVAL;
> -			goto put_vm;
> -		}
> -	}
> -
>  	err = down_write_killable(&vm->lock);
>  	if (err)
>  		goto put_vm;
> @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  		}
>  	}
>  
> -	if (args->num_syncs) {
> -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> +	if (args->syncs.num_syncs) {
> +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs), GFP_KERNEL);
>  		if (!syncs) {
>  			err = -ENOMEM;
>  			goto put_obj;
>  		}
>  	}
>  
> -	syncs_user = u64_to_user_ptr(args->syncs);
> -	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
> +	syncs_user = u64_to_user_ptr(args->syncs.syncs);
> +	for (num_syncs = 0; num_syncs < args->syncs.num_syncs; num_syncs++) {
>  		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
>  					  &syncs_user[num_syncs],
>  					  (xe_vm_in_lr_mode(vm) ?
> @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
>  free_syncs:
>  	if (err == -ENODATA)
> -		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
> +		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs,
> +						  async);
>  	while (num_syncs--)
>  		xe_sync_entry_cleanup(&syncs[num_syncs]);
>  
> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> index 23abdfd8622f..ce8b9bde7e9c 100644
> --- a/drivers/gpu/drm/xe/xe_vm_types.h
> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> @@ -167,13 +167,12 @@ struct xe_vm {
>  	 */
>  #define XE_VM_FLAG_64K			BIT(0)
>  #define XE_VM_FLAG_LR_MODE		BIT(1)
> -#define XE_VM_FLAG_ASYNC_DEFAULT	BIT(2)
> -#define XE_VM_FLAG_MIGRATION		BIT(3)
> -#define XE_VM_FLAG_SCRATCH_PAGE		BIT(4)
> -#define XE_VM_FLAG_FAULT_MODE		BIT(5)
> -#define XE_VM_FLAG_BANNED		BIT(6)
> -#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(8, 7), flags)
> -#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(8, 7), (tile)->id)
> +#define XE_VM_FLAG_MIGRATION		BIT(2)
> +#define XE_VM_FLAG_SCRATCH_PAGE		BIT(3)
> +#define XE_VM_FLAG_FAULT_MODE		BIT(4)
> +#define XE_VM_FLAG_BANNED		BIT(5)
> +#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(7, 6), flags)
> +#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(7, 6), (tile)->id)
>  	unsigned long flags;
>  
>  	/** @composite_fence_ctx: context composite fence */
> @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
>  	XE_VMA_OP_PREV_COMMITTED	= BIT(3),
>  	/** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */
>  	XE_VMA_OP_NEXT_COMMITTED	= BIT(4),
> +	/** @XE_VMA_OP_ASYNC: operation is async */
> +	XE_VMA_OP_ASYNC			= BIT(5),
>  };
>  
>  /** struct xe_vma_op - VMA operation */
> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index eb03a49c17a1..fd8172fe2d9a 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
>  	 * Kernel only classes (not actual hardware engine class). Used for
>  	 * creating ordered queues of VM bind operations.
>  	 */
> -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC	5
> -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC	6
> +#define DRM_XE_ENGINE_CLASS_VM_BIND		5
>  	__u16 engine_class;
>  
>  	__u16 engine_instance;
> @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
>  	 * still enable recoverable pagefaults if supported by the device.
>  	 */
>  #define DRM_XE_VM_CREATE_FLAG_LR_MODE	        (1 << 1)
> -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT	(1 << 2)
>  	/*
>  	 * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
>  	 * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to be allocated
> @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
>  	 * The xe driver internally uses recoverable pagefaults to implement
>  	 * this.
>  	 */
> -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 3)
> +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 2)
>  	/** @flags: Flags */
>  	__u32 flags;
>  
> @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
>  	__u32 op;
>  
>  #define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
> -#define DRM_XE_VM_BIND_FLAG_ASYNC	(1 << 1)
>  	/*
>  	 * Valid on a faulting VM only, do the MAP operation immediately rather
>  	 * than deferring the MAP to the page fault handler.
>  	 */
> -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 2)
> +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
>  	/*
>  	 * When the NULL flag is set, the page tables are setup with a special
>  	 * bit which indicates writes are dropped and all reads return zero.  In
> @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
>  	 * operations, the BO handle MBZ, and the BO offset MBZ. This flag is
>  	 * intended to implement VK sparse bindings.
>  	 */
> -#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 3)
> +#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
>  	/** @flags: Bind flags */
>  	__u32 flags;
>  
> @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
>  	__u64 reserved[3];
>  };
>  
> +/**
> + * struct drm_xe_syncs - In / out syncs for IOCTLs.
> + */
> +struct drm_xe_syncs {
> +	/** @num_syncs: amount of syncs to wait on */
> +	__u32 num_syncs;
> +
> +	/*
> +	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
> +	 */
> +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
> +	/** @flags: Sync flags */
> +	__u32 flags;
> +
> +	/** @syncs: pointer to struct drm_xe_sync array */
> +	__u64 syncs;
> +
> +	/** @reserved: Reserved */
> +	__u64 reserved[2];
> +};
> +
>  struct drm_xe_vm_bind {
>  	/** @extensions: Pointer to the first extension struct, if any */
>  	__u64 extensions;
> @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
>  		__u64 vector_of_binds;
>  	};
>  
> -	/** @pad: MBZ */
> -	__u32 pad2;
> -
> -	/** @num_syncs: amount of syncs to wait on */
> -	__u32 num_syncs;
> -
> -	/** @syncs: pointer to struct drm_xe_sync array */
> -	__u64 syncs;
> +	/** @syncs: syncs for bind */
> +	struct drm_xe_syncs syncs;
>  
>  	/** @reserved: Reserved */
>  	__u64 reserved[2];
> @@ -974,14 +986,14 @@ struct drm_xe_exec {
>  	/** @extensions: Pointer to the first extension struct, if any */
>  	__u64 extensions;
>  
> +	/** @pad: MBZ */
> +	__u32 pad;
> +
>  	/** @exec_queue_id: Exec queue ID for the batch buffer */
>  	__u32 exec_queue_id;
>  
> -	/** @num_syncs: Amount of struct drm_xe_sync in array. */
> -	__u32 num_syncs;
> -
> -	/** @syncs: Pointer to struct drm_xe_sync array. */
> -	__u64 syncs;
> +	/** @syncs: syncs for exec */
> +	struct drm_xe_syncs syncs;
>  
>  	/**
>  	 * @address: address of batch buffer if num_batch_buffer == 1 or an
> @@ -995,8 +1007,8 @@ struct drm_xe_exec {
>  	 */
>  	__u16 num_batch_buffer;
>  
> -	/** @pad: MBZ */
> -	__u16 pad[3];
> +	/** @pad2: MBZ */
> +	__u16 pad2[3];
>  
>  	/** @reserved: Reserved */
>  	__u64 reserved[2];
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-08 15:00   ` Thomas Hellström
@ 2023-12-08  9:45     ` Matthew Brost
  2023-12-11 15:43       ` Thomas Hellström
  2023-12-08 12:24     ` Matthew Brost
  1 sibling, 1 reply; 22+ messages in thread
From: Matthew Brost @ 2023-12-08  9:45 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: Francois Dugast, intel-xe, Rodrigo Vivi

On Fri, Dec 08, 2023 at 04:00:37PM +0100, Thomas Hellström wrote:
> 
> On 12/7/23 06:57, Matthew Brost wrote:
> > Remove the concept of async vs sync VM bind queues, rather make async vs
> > sync a per-IOCTL choice. Since this is per IOCTL, it makes sense to have
> > a singular IOCTL flag rather than a per VM bind op flag too. Add
> > DRM_XE_SYNCS_FLAG_WAIT_FOR_OP, which is an input sync flag to support
> > this. Support this new flag for both the VM bind IOCTL and the exec
> > IOCTL to match behavior.
> > 
> > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Cc: Francois Dugast <francois.dugast@intel.com>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++----
> >   drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
> >   drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
> >   drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-------------
> >   drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
> >   include/uapi/drm/xe_drm.h                |  56 +++++++-----
> >   6 files changed, 129 insertions(+), 119 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> > index 92b0da6580e8..c62cabfaa112 100644
> > --- a/drivers/gpu/drm/xe/xe_exec.c
> > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > @@ -130,12 +130,15 @@ static int xe_exec_begin(struct drm_exec *exec, struct xe_vm *vm)
> >   	return err;
> >   }
> > +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> > +
> >   int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   {
> >   	struct xe_device *xe = to_xe_device(dev);
> >   	struct xe_file *xef = to_xe_file(file);
> >   	struct drm_xe_exec *args = data;
> > -	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
> > +	struct drm_xe_sync __user *syncs_user =
> > +		u64_to_user_ptr(args->syncs.syncs);
> >   	u64 __user *addresses_user = u64_to_user_ptr(args->address);
> >   	struct xe_exec_queue *q;
> >   	struct xe_sync_entry *syncs = NULL;
> > @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   	struct drm_exec exec;
> >   	u32 i, num_syncs = 0;
> >   	struct xe_sched_job *job;
> > -	struct dma_fence *rebind_fence;
> > +	struct dma_fence *rebind_fence, *job_fence;
> >   	struct xe_vm *vm;
> > -	bool write_locked;
> > +	bool write_locked, skip_job_put = false;
> > +	bool wait = args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
> >   	ktime_t end = 0;
> >   	int err = 0;
> >   	if (XE_IOCTL_DBG(xe, args->extensions) ||
> > -	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
> > -	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> > +	    XE_IOCTL_DBG(xe, args->pad || args->pad2[0] || args->pad2[1] || args->pad2[2]) ||
> > +	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
> > +	    XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > +	    XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
> >   		return -EINVAL;
> >   	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> > @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   		goto err_exec_queue;
> >   	}
> > -	if (args->num_syncs) {
> > -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> > +	if (args->syncs.num_syncs) {
> > +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs),
> > +				GFP_KERNEL);
> >   		if (!syncs) {
> >   			err = -ENOMEM;
> >   			goto err_exec_queue;
> > @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   	vm = q->vm;
> > -	for (i = 0; i < args->num_syncs; i++) {
> > +	for (i = 0; i < args->syncs.num_syncs; i++) {
> >   		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
> >   					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
> >   					  (xe_vm_in_lr_mode(vm) ?
> > @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   				err = PTR_ERR(fence);
> >   				goto err_exec;
> >   			}
> > +
> >   			for (i = 0; i < num_syncs; i++)
> >   				xe_sync_entry_signal(&syncs[i], NULL, fence);
> > +
> >   			xe_exec_queue_last_fence_set(q, vm, fence);
> > +			if (wait) {
> > +				long timeout = dma_fence_wait(fence, true);
> > +
> > +				if (timeout < 0)
> > +					err = -EINTR;
> > +			}
> 
> Here it looks like we will rerun the same IOCTL if we return -EINTR.
> The user-space expected action on -EINTR is to just restart the IOCTL
> without any argument changes. A solution is to add an ioctl argument cookie
> (or to skip sync vm binds and have the user just use the 0 batch buffers /
> vm_binds calls or wait for an out-fence). If you go for the cookie solution
> then IMO we should keep the -ERESTARTSYS returned from dma_fence_wait(),
> since it's converted to -EINTR on return-to-user-space, and the kernel
> restarts the IOCTL automatically if there was no requested-for-delivery
> signal pending.
> 
> I think the simplest solution at this point is to skip the sync behaviour,
> in particular if we enable the 0 batch / bind possibility.
> 
> If we still want to provide it, we could add a cookie address as an
> extension to the ioctl and activate sync if present? (Just throwing out
> ideas here.)
> 

Hmm, forgot about this. A cookie is fairly easy, what about something like this:

/**
 * struct drm_xe_syncs - In / out syncs for IOCTLs.
 */
struct drm_xe_syncs {
	/** @num_syncs: amount of syncs to wait on */
	__u32 num_syncs;

	/*
	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
	 */
#define DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP (1 << 0)
	/** @in_flags: Input Sync flags */
	__u16 in_flags;

	/*
	 * IOCTL operation has started (no need for user to resubmit on
	 * -ERESTARTSYS)
	 */
#define DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED (1 << 0)
	/** @out_flags: Output Sync flags */
	__u16 out_flags;

	/** @syncs: pointer to struct drm_xe_sync array */
	__u64 syncs;

	/** @reserved: Reserved */
	__u64 reserved[2];
};

DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED gets set in the exec / bind IOCTL after
the job is committed or, in the case of zero ops, after the last fence is
updated on the queue. Note that for binds we don't yet do 1 job per IOCTL,
but we will after landing some version of [1].

After DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED is set we return -ERESTARTSYS if
the wait is interrupted, and -EINTR if DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED
is still unset (interrupted before the job is committed).

I'd rather go with the patch as we have to change the uAPI here regardless,
so we might as well make this complete.
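
Kernel side, consuming the cookie could look roughly like the below (a
minimal sketch only, assuming the semantics above; xe_submit_op() is a
made-up placeholder for the existing submission path, and it assumes the
args struct is copied back to user space before a restart):

	bool committed = args->syncs.out_flags &
		DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED;

	if (!committed) {
		/* First pass, or restarted before anything was committed */
		err = xe_submit_op(...);	/* placeholder, not a real helper */
		if (err)
			return err;	/* nothing committed, plain -EINTR is safe */
		args->syncs.out_flags |= DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED;
	}

	if (wait) {
		long timeout = dma_fence_wait(fence, true);

		/* Cookie is set, so an automatic restart skips the resubmit */
		if (timeout < 0)
			return -ERESTARTSYS;
	}

User space then just keeps passing the same args struct on restart, since
the committed state travels in out_flags.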

Matt

[1] https://patchwork.freedesktop.org/series/125608/

> >   			dma_fence_put(fence);
> >   		}
> > @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   	 * the job and let the DRM scheduler / backend clean up the job.
> >   	 */
> >   	xe_sched_job_arm(job);
> > +	job_fence = &job->drm.s_fence->finished;
> > +	if (wait)
> > +		dma_fence_get(job_fence);
> >   	if (!xe_vm_in_lr_mode(vm)) {
> >   		/* Block userptr invalidations / BO eviction */
> > -		dma_resv_add_fence(&vm->resv,
> > -				   &job->drm.s_fence->finished,
> > +		dma_resv_add_fence(&vm->resv, job_fence,
> >   				   DMA_RESV_USAGE_BOOKKEEP);
> >   		/*
> >   		 * Make implicit sync work across drivers, assuming all external
> >   		 * BOs are written as we don't pass in a read / write list.
> >   		 */
> > -		xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
> > -					DMA_RESV_USAGE_WRITE);
> > +		xe_vm_fence_all_extobjs(vm, job_fence, DMA_RESV_USAGE_WRITE);
> >   	}
> >   	for (i = 0; i < num_syncs; i++)
> > -		xe_sync_entry_signal(&syncs[i], job,
> > -				     &job->drm.s_fence->finished);
> > +		xe_sync_entry_signal(&syncs[i], job, job_fence);
> >   	if (xe_exec_queue_is_lr(q))
> >   		q->ring_ops->emit_job(job);
> >   	if (!xe_vm_in_lr_mode(vm))
> > -		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
> > +		xe_exec_queue_last_fence_set(q, vm, job_fence);
> >   	xe_sched_job_push(job);
> >   	xe_vm_reactivate_rebind(vm);
> > -	if (!err && !xe_vm_in_lr_mode(vm)) {
> > +	if (!xe_vm_in_lr_mode(vm)) {
> >   		spin_lock(&xe->ttm.lru_lock);
> >   		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
> >   		spin_unlock(&xe->ttm.lru_lock);
> >   	}
> > +	skip_job_put = true;
> > +	if (wait) {
> > +		long timeout = dma_fence_wait(job_fence, true);
> > +
> > +		dma_fence_put(job_fence);
> > +		if (timeout < 0)
> > +			err = -EINTR;
> > +	}
> > +
> >   err_repin:
> >   	if (!xe_vm_in_lr_mode(vm))
> >   		up_read(&vm->userptr.notifier_lock);
> >   err_put_job:
> > -	if (err)
> > +	if (err && !skip_job_put)
> >   		xe_sched_job_put(job);
> >   err_exec:
> >   	drm_exec_fini(&exec);
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> > index 3911d14522ee..98776d02d634 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
> >   	if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
> >   		return -EINVAL;
> > -	if (eci[0].engine_class >= DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
> > -		bool sync = eci[0].engine_class ==
> > -			DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
> > -
> > +	if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
> >   		for_each_gt(gt, xe, id) {
> >   			struct xe_exec_queue *new;
> > @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
> >   						   args->width, hwe,
> >   						   EXEC_QUEUE_FLAG_PERSISTENT |
> >   						   EXEC_QUEUE_FLAG_VM |
> > -						   (sync ? 0 :
> > -						    EXEC_QUEUE_FLAG_VM_ASYNC) |
> >   						   (id ?
> >   						    EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
> >   						    0));
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > index 52f0927d0d9b..c78f6e8b41c4 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > @@ -74,8 +74,6 @@ struct xe_exec_queue {
> >   #define EXEC_QUEUE_FLAG_VM			BIT(4)
> >   /* child of VM queue for multi-tile VM jobs */
> >   #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD	BIT(5)
> > -/* VM jobs for this queue are asynchronous */
> > -#define EXEC_QUEUE_FLAG_VM_ASYNC		BIT(6)
> >   	/**
> >   	 * @flags: flags for this exec queue, should statically setup aside from ban
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index cf2eb44a71db..4b0c976c003a 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
> >   			struct xe_gt *gt = tile->primary_gt;
> >   			struct xe_vm *migrate_vm;
> >   			struct xe_exec_queue *q;
> > -			u32 create_flags = EXEC_QUEUE_FLAG_VM |
> > -				((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
> > -				EXEC_QUEUE_FLAG_VM_ASYNC : 0);
> > +			u32 create_flags = EXEC_QUEUE_FLAG_VM;
> >   			if (!vm->pt_root[id])
> >   				continue;
> > @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
> >   	return ERR_PTR(err);
> >   }
> > -static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
> > -{
> > -	return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
> > -		!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
> > -}
> > -
> >   static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> >   			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
> >   			u32 num_syncs, bool immediate, bool first_op,
> > -			bool last_op)
> > +			bool last_op, bool async)
> >   {
> >   	struct dma_fence *fence;
> >   	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> > @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> >   	if (last_op)
> >   		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> > -	if (last_op && xe_vm_sync_mode(vm, q))
> > +	if (last_op && !async)
> >   		dma_fence_wait(fence, true);
> >   	dma_fence_put(fence);
> > @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> >   static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
> >   		      struct xe_bo *bo, struct xe_sync_entry *syncs,
> >   		      u32 num_syncs, bool immediate, bool first_op,
> > -		      bool last_op)
> > +		      bool last_op, bool async)
> >   {
> >   	int err;
> > @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue
> >   	}
> >   	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
> > -			    last_op);
> > +			    last_op, async);
> >   }
> >   static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> >   			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
> > -			u32 num_syncs, bool first_op, bool last_op)
> > +			u32 num_syncs, bool first_op, bool last_op, bool async)
> >   {
> >   	struct dma_fence *fence;
> >   	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> > @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> >   	xe_vma_destroy(vma, fence);
> >   	if (last_op)
> >   		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> > -	if (last_op && xe_vm_sync_mode(vm, q))
> > +	if (last_op && !async)
> >   		dma_fence_wait(fence, true);
> 
> It looks like we're dropping the error return code here.
> 
> 
> >   	dma_fence_put(fence);
> > @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> >   #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
> >   				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
> > -				    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
> >   				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> >   int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> > @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> >   		flags |= XE_VM_FLAG_SCRATCH_PAGE;
> >   	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
> >   		flags |= XE_VM_FLAG_LR_MODE;
> > -	if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
> > -		flags |= XE_VM_FLAG_ASYNC_DEFAULT;
> >   	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> >   		flags |= XE_VM_FLAG_FAULT_MODE;
> > @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] = {
> >   static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
> >   			  struct xe_exec_queue *q, u32 region,
> >   			  struct xe_sync_entry *syncs, u32 num_syncs,
> > -			  bool first_op, bool last_op)
> > +			  bool first_op, bool last_op, bool async)
> >   {
> >   	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> >   	int err;
> > @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
> >   	if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
> >   		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
> > -				  true, first_op, last_op);
> > +				  true, first_op, last_op, async);
> >   	} else {
> >   		int i;
> > @@ -2400,6 +2389,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
> >   		}
> >   		op->q = q;
> > +		if (async)
> > +			op->flags |= XE_VMA_OP_ASYNC;
> >   		switch (op->base.op) {
> >   		case DRM_GPUVA_OP_MAP:
> > @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >   				 op->syncs, op->num_syncs,
> >   				 op->map.immediate || !xe_vm_in_fault_mode(vm),
> >   				 op->flags & XE_VMA_OP_FIRST,
> > -				 op->flags & XE_VMA_OP_LAST);
> > +				 op->flags & XE_VMA_OP_LAST,
> > +				 op->flags & XE_VMA_OP_ASYNC);
> >   		break;
> >   	case DRM_GPUVA_OP_REMAP:
> >   	{
> > @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >   					   op->num_syncs,
> >   					   op->flags & XE_VMA_OP_FIRST,
> >   					   op->flags & XE_VMA_OP_LAST &&
> > -					   !prev && !next);
> > +					   !prev && !next,
> > +					   op->flags & XE_VMA_OP_ASYNC);
> >   			if (err)
> >   				break;
> >   			op->remap.unmap_done = true;
> > @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >   			err = xe_vm_bind(vm, op->remap.prev, op->q,
> >   					 xe_vma_bo(op->remap.prev), op->syncs,
> >   					 op->num_syncs, true, false,
> > -					 op->flags & XE_VMA_OP_LAST && !next);
> > +					 op->flags & XE_VMA_OP_LAST && !next,
> > +					 op->flags & XE_VMA_OP_ASYNC);
> >   			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
> >   			if (err)
> >   				break;
> > @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >   					 xe_vma_bo(op->remap.next),
> >   					 op->syncs, op->num_syncs,
> >   					 true, false,
> > -					 op->flags & XE_VMA_OP_LAST);
> > +					 op->flags & XE_VMA_OP_LAST,
> > +					 op->flags & XE_VMA_OP_ASYNC);
> >   			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
> >   			if (err)
> >   				break;
> > @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >   	case DRM_GPUVA_OP_UNMAP:
> >   		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
> >   				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
> > -				   op->flags & XE_VMA_OP_LAST);
> > +				   op->flags & XE_VMA_OP_LAST,
> > +				   op->flags & XE_VMA_OP_ASYNC);
> >   		break;
> >   	case DRM_GPUVA_OP_PREFETCH:
> >   		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
> >   				     op->syncs, op->num_syncs,
> >   				     op->flags & XE_VMA_OP_FIRST,
> > -				     op->flags & XE_VMA_OP_LAST);
> > +				     op->flags & XE_VMA_OP_LAST,
> > +				     op->flags & XE_VMA_OP_ASYNC);
> >   		break;
> >   	default:
> >   		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
> > @@ -2808,16 +2805,16 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
> >   #ifdef TEST_VM_ASYNC_OPS_ERROR
> >   #define SUPPORTED_FLAGS	\
> > -	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
> > -	 DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
> > -	 DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> > +	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
> > +	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> >   #else
> >   #define SUPPORTED_FLAGS	\
> > -	(DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
> > +	(DRM_XE_VM_BIND_FLAG_READONLY | \
> >   	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
> >   	 0xffff)
> >   #endif
> >   #define XE_64K_PAGE_MASK 0xffffull
> > +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> >   #define MAX_BINDS	512	/* FIXME: Picking random upper limit */
> > @@ -2829,7 +2826,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> >   	int err;
> >   	int i;
> > -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > +	if (XE_IOCTL_DBG(xe, args->pad) ||
> >   	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> >   		return -EINVAL;
> > @@ -2857,6 +2854,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> >   		*bind_ops = &args->bind;
> >   	}
> > +	*async = !(args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
> > +
> > +	if (XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > +	    XE_IOCTL_DBG(xe, !*async && args->syncs.num_syncs)) {
> > +		err = -EINVAL;
> > +		goto free_bind_ops;
> > +	}
> > +
> >   	for (i = 0; i < args->num_binds; ++i) {
> >   		u64 range = (*bind_ops)[i].range;
> >   		u64 addr = (*bind_ops)[i].addr;
> > @@ -2887,18 +2892,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> >   			goto free_bind_ops;
> >   		}
> > -		if (i == 0) {
> > -			*async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
> > -			if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
> > -				err = -EINVAL;
> > -				goto free_bind_ops;
> > -			}
> > -		} else if (XE_IOCTL_DBG(xe, *async !=
> > -					!!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
> > -			err = -EINVAL;
> > -			goto free_bind_ops;
> > -		}
> > -
> >   		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
> >   		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
> >   		    XE_IOCTL_DBG(xe, obj && is_null) ||
> > @@ -2951,7 +2944,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> >   static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> >   				       struct xe_exec_queue *q,
> >   				       struct xe_sync_entry *syncs,
> > -				       int num_syncs)
> > +				       int num_syncs, bool async)
> >   {
> >   	struct dma_fence *fence;
> >   	int i, err = 0;
> > @@ -2967,7 +2960,7 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> >   	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
> >   				     fence);
> > -	if (xe_vm_sync_mode(vm, q)) {
> > +	if (!async) {
> >   		long timeout = dma_fence_wait(fence, true);
> >   		if (timeout < 0)
> > @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   	if (err)
> >   		return err;
> > -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > +	if (XE_IOCTL_DBG(xe, args->pad) ||
> >   	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> >   		return -EINVAL;
> > @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   			err = -EINVAL;
> >   			goto put_exec_queue;
> >   		}
> > -
> > -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> > -				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
> > -			err = -EINVAL;
> > -			goto put_exec_queue;
> > -		}
> >   	}
> >   	vm = xe_vm_lookup(xef, args->vm_id);
> > @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   		goto put_exec_queue;
> >   	}
> > -	if (!args->exec_queue_id) {
> > -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> > -				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
> > -			err = -EINVAL;
> > -			goto put_vm;
> > -		}
> > -	}
> > -
> >   	err = down_write_killable(&vm->lock);
> >   	if (err)
> >   		goto put_vm;
> > @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   		}
> >   	}
> > -	if (args->num_syncs) {
> > -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> > +	if (args->syncs.num_syncs) {
> > +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs), GFP_KERNEL);
> >   		if (!syncs) {
> >   			err = -ENOMEM;
> >   			goto put_obj;
> >   		}
> >   	}
> > -	syncs_user = u64_to_user_ptr(args->syncs);
> > -	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
> > +	syncs_user = u64_to_user_ptr(args->syncs.syncs);
> > +	for (num_syncs = 0; num_syncs < args->syncs.num_syncs; num_syncs++) {
> >   		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
> >   					  &syncs_user[num_syncs],
> >   					  (xe_vm_in_lr_mode(vm) ?
> > @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
> >   free_syncs:
> >   	if (err == -ENODATA)
> > -		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
> > +		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs,
> > +						  async);
> >   	while (num_syncs--)
> >   		xe_sync_entry_cleanup(&syncs[num_syncs]);
> > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> > index 23abdfd8622f..ce8b9bde7e9c 100644
> > --- a/drivers/gpu/drm/xe/xe_vm_types.h
> > +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> > @@ -167,13 +167,12 @@ struct xe_vm {
> >   	 */
> >   #define XE_VM_FLAG_64K			BIT(0)
> >   #define XE_VM_FLAG_LR_MODE		BIT(1)
> > -#define XE_VM_FLAG_ASYNC_DEFAULT	BIT(2)
> > -#define XE_VM_FLAG_MIGRATION		BIT(3)
> > -#define XE_VM_FLAG_SCRATCH_PAGE		BIT(4)
> > -#define XE_VM_FLAG_FAULT_MODE		BIT(5)
> > -#define XE_VM_FLAG_BANNED		BIT(6)
> > -#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(8, 7), flags)
> > -#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(8, 7), (tile)->id)
> > +#define XE_VM_FLAG_MIGRATION		BIT(2)
> > +#define XE_VM_FLAG_SCRATCH_PAGE		BIT(3)
> > +#define XE_VM_FLAG_FAULT_MODE		BIT(4)
> > +#define XE_VM_FLAG_BANNED		BIT(5)
> > +#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(7, 6), flags)
> > +#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(7, 6), (tile)->id)
> >   	unsigned long flags;
> >   	/** @composite_fence_ctx: context composite fence */
> > @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
> >   	XE_VMA_OP_PREV_COMMITTED	= BIT(3),
> >   	/** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */
> >   	XE_VMA_OP_NEXT_COMMITTED	= BIT(4),
> > +	/** @XE_VMA_OP_ASYNC: operation is async */
> > +	XE_VMA_OP_ASYNC			= BIT(5),
> >   };
> >   /** struct xe_vma_op - VMA operation */
> > diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> > index eb03a49c17a1..fd8172fe2d9a 100644
> > --- a/include/uapi/drm/xe_drm.h
> > +++ b/include/uapi/drm/xe_drm.h
> > @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
> >   	 * Kernel only classes (not actual hardware engine class). Used for
> >   	 * creating ordered queues of VM bind operations.
> >   	 */
> > -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC	5
> > -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC	6
> > +#define DRM_XE_ENGINE_CLASS_VM_BIND		5
> >   	__u16 engine_class;
> >   	__u16 engine_instance;
> > @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
> >   	 * still enable recoverable pagefaults if supported by the device.
> >   	 */
> >   #define DRM_XE_VM_CREATE_FLAG_LR_MODE	        (1 << 1)
> > -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT	(1 << 2)
> >   	/*
> >   	 * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
> >   	 * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to be allocated
> > @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
> >   	 * The xe driver internally uses recoverable pagefaults to implement
> >   	 * this.
> >   	 */
> > -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 3)
> > +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 2)
> >   	/** @flags: Flags */
> >   	__u32 flags;
> > @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
> >   	__u32 op;
> >   #define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
> > -#define DRM_XE_VM_BIND_FLAG_ASYNC	(1 << 1)
> >   	/*
> >   	 * Valid on a faulting VM only, do the MAP operation immediately rather
> >   	 * than deferring the MAP to the page fault handler.
> >   	 */
> > -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 2)
> > +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
> >   	/*
> >   	 * When the NULL flag is set, the page tables are setup with a special
> >   	 * bit which indicates writes are dropped and all reads return zero.  In
> > @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
> >   	 * operations, the BO handle MBZ, and the BO offset MBZ. This flag is
> >   	 * intended to implement VK sparse bindings.
> >   	 */
> > -#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 3)
> > +#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
> >   	/** @flags: Bind flags */
> >   	__u32 flags;
> > @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
> >   	__u64 reserved[3];
> >   };
> > +/**
> > + * struct drm_xe_syncs - In / out syncs for IOCTLs.
> > + */
> > +struct drm_xe_syncs {
> > +	/** @num_syncs: amount of syncs to wait on */
> > +	__u32 num_syncs;
> > +
> > +	/*
> > +	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
> > +	 */
> > +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
> > +	/** @flags: Sync flags */
> > +	__u32 flags;
> > +
> > +	/** @syncs: pointer to struct drm_xe_sync array */
> > +	__u64 syncs;
> > +
> > +	/** @reserved: Reserved */
> > +	__u64 reserved[2];
> > +};
> > +
> >   struct drm_xe_vm_bind {
> >   	/** @extensions: Pointer to the first extension struct, if any */
> >   	__u64 extensions;
> > @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
> >   		__u64 vector_of_binds;
> >   	};
> > -	/** @pad: MBZ */
> > -	__u32 pad2;
> > -
> > -	/** @num_syncs: amount of syncs to wait on */
> > -	__u32 num_syncs;
> > -
> > -	/** @syncs: pointer to struct drm_xe_sync array */
> > -	__u64 syncs;
> > +	/** @syncs: syncs for bind */
> > +	struct drm_xe_syncs syncs;
> >   	/** @reserved: Reserved */
> >   	__u64 reserved[2];
> > @@ -974,14 +986,14 @@ struct drm_xe_exec {
> >   	/** @extensions: Pointer to the first extension struct, if any */
> >   	__u64 extensions;
> > +	/** @pad: MBZ */
> > +	__u32 pad;
> > +
> >   	/** @exec_queue_id: Exec queue ID for the batch buffer */
> >   	__u32 exec_queue_id;
> > -	/** @num_syncs: Amount of struct drm_xe_sync in array. */
> > -	__u32 num_syncs;
> > -
> > -	/** @syncs: Pointer to struct drm_xe_sync array. */
> > -	__u64 syncs;
> > +	/** @syncs: syncs for exec */
> > +	struct drm_xe_syncs syncs;
> >   	/**
> >   	 * @address: address of batch buffer if num_batch_buffer == 1 or an
> > @@ -995,8 +1007,8 @@ struct drm_xe_exec {
> >   	 */
> >   	__u16 num_batch_buffer;
> > -	/** @pad: MBZ */
> > -	__u16 pad[3];
> > +	/** @pad2: MBZ */
> > +	__u16 pad2[3];
> >   	/** @reserved: Reserved */
> >   	__u64 reserved[2];

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-08 15:00   ` Thomas Hellström
  2023-12-08  9:45     ` Matthew Brost
@ 2023-12-08 12:24     ` Matthew Brost
  2023-12-11 15:34       ` Thomas Hellström
  1 sibling, 1 reply; 22+ messages in thread
From: Matthew Brost @ 2023-12-08 12:24 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: Francois Dugast, intel-xe, Rodrigo Vivi

On Fri, Dec 08, 2023 at 04:00:37PM +0100, Thomas Hellström wrote:
> 

Missed a comment, addressing below.

> On 12/7/23 06:57, Matthew Brost wrote:
> > Remove the concept of async vs sync VM bind queues; instead make async
> > vs sync a per-IOCTL choice. Since this is per IOCTL, it makes sense to
> > have a single IOCTL-level flag rather than a per VM bind op flag too.
> > Add DRM_XE_SYNCS_FLAG_WAIT_FOR_OP, an input sync flag, to support
> > this. Support this new flag for both the VM bind IOCTL and the exec
> > IOCTL to match behavior.
> > 
> > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Cc: Francois Dugast <francois.dugast@intel.com>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++----
> >   drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
> >   drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
> >   drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-------------
> >   drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
> >   include/uapi/drm/xe_drm.h                |  56 +++++++-----
> >   6 files changed, 129 insertions(+), 119 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> > index 92b0da6580e8..c62cabfaa112 100644
> > --- a/drivers/gpu/drm/xe/xe_exec.c
> > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > @@ -130,12 +130,15 @@ static int xe_exec_begin(struct drm_exec *exec, struct xe_vm *vm)
> >   	return err;
> >   }
> > +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> > +
> >   int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   {
> >   	struct xe_device *xe = to_xe_device(dev);
> >   	struct xe_file *xef = to_xe_file(file);
> >   	struct drm_xe_exec *args = data;
> > -	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
> > +	struct drm_xe_sync __user *syncs_user =
> > +		u64_to_user_ptr(args->syncs.syncs);
> >   	u64 __user *addresses_user = u64_to_user_ptr(args->address);
> >   	struct xe_exec_queue *q;
> >   	struct xe_sync_entry *syncs = NULL;
> > @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   	struct drm_exec exec;
> >   	u32 i, num_syncs = 0;
> >   	struct xe_sched_job *job;
> > -	struct dma_fence *rebind_fence;
> > +	struct dma_fence *rebind_fence, *job_fence;
> >   	struct xe_vm *vm;
> > -	bool write_locked;
> > +	bool write_locked, skip_job_put = false;
> > +	bool wait = args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
> >   	ktime_t end = 0;
> >   	int err = 0;
> >   	if (XE_IOCTL_DBG(xe, args->extensions) ||
> > -	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
> > -	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> > +	    XE_IOCTL_DBG(xe, args->pad || args->pad2[0] || args->pad2[1] || args->pad2[2]) ||
> > +	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
> > +	    XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > +	    XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
> >   		return -EINVAL;
> >   	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> > @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   		goto err_exec_queue;
> >   	}
> > -	if (args->num_syncs) {
> > -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> > +	if (args->syncs.num_syncs) {
> > +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs),
> > +				GFP_KERNEL);
> >   		if (!syncs) {
> >   			err = -ENOMEM;
> >   			goto err_exec_queue;
> > @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   	vm = q->vm;
> > -	for (i = 0; i < args->num_syncs; i++) {
> > +	for (i = 0; i < args->syncs.num_syncs; i++) {
> >   		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
> >   					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
> >   					  (xe_vm_in_lr_mode(vm) ?
> > @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   				err = PTR_ERR(fence);
> >   				goto err_exec;
> >   			}
> > +
> >   			for (i = 0; i < num_syncs; i++)
> >   				xe_sync_entry_signal(&syncs[i], NULL, fence);
> > +
> >   			xe_exec_queue_last_fence_set(q, vm, fence);
> > +			if (wait) {
> > +				long timeout = dma_fence_wait(fence, true);
> > +
> > +				if (timeout < 0)
> > +					err = -EINTR;
> > +			}
> 
> Here it looks like we will rerun the same IOCTL if we return -EINTR:
> the expected user-space action on -EINTR is to just restart the IOCTL
> without any argument changes. A solution is to add an ioctl argument
> cookie (or to skip sync vm binds and have the user just use the
> 0-batch-buffer / 0-bind calls, or wait for an out-fence). If you go for
> the cookie solution then IMO we should keep the -ERESTARTSYS returned
> from dma_fence_wait(), since it's converted to -EINTR on
> return-to-user-space and the kernel restarts the IOCTL automatically if
> there was no requested-for-delivery signal pending.
> 
> I think the simplest solution at this point is to skip the sync
> behaviour, in particular if we enable the 0 batch / bind possibility.
> 
> If we still want to provide it, we could add a cookie address as an
> extension to the ioctl and activate sync only if present? (Just
> throwing out ideas here.)
> 
> >   			dma_fence_put(fence);
> >   		}
> > @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   	 * the job and let the DRM scheduler / backend clean up the job.
> >   	 */
> >   	xe_sched_job_arm(job);
> > +	job_fence = &job->drm.s_fence->finished;
> > +	if (wait)
> > +		dma_fence_get(job_fence);
> >   	if (!xe_vm_in_lr_mode(vm)) {
> >   		/* Block userptr invalidations / BO eviction */
> > -		dma_resv_add_fence(&vm->resv,
> > -				   &job->drm.s_fence->finished,
> > +		dma_resv_add_fence(&vm->resv, job_fence,
> >   				   DMA_RESV_USAGE_BOOKKEEP);
> >   		/*
> >   		 * Make implicit sync work across drivers, assuming all external
> >   		 * BOs are written as we don't pass in a read / write list.
> >   		 */
> > -		xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
> > -					DMA_RESV_USAGE_WRITE);
> > +		xe_vm_fence_all_extobjs(vm, job_fence, DMA_RESV_USAGE_WRITE);
> >   	}
> >   	for (i = 0; i < num_syncs; i++)
> > -		xe_sync_entry_signal(&syncs[i], job,
> > -				     &job->drm.s_fence->finished);
> > +		xe_sync_entry_signal(&syncs[i], job, job_fence);
> >   	if (xe_exec_queue_is_lr(q))
> >   		q->ring_ops->emit_job(job);
> >   	if (!xe_vm_in_lr_mode(vm))
> > -		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
> > +		xe_exec_queue_last_fence_set(q, vm, job_fence);
> >   	xe_sched_job_push(job);
> >   	xe_vm_reactivate_rebind(vm);
> > -	if (!err && !xe_vm_in_lr_mode(vm)) {
> > +	if (!xe_vm_in_lr_mode(vm)) {
> >   		spin_lock(&xe->ttm.lru_lock);
> >   		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
> >   		spin_unlock(&xe->ttm.lru_lock);
> >   	}
> > +	skip_job_put = true;
> > +	if (wait) {
> > +		long timeout = dma_fence_wait(job_fence, true);
> > +
> > +		dma_fence_put(job_fence);
> > +		if (timeout < 0)
> > +			err = -EINTR;
> > +	}
> > +
> >   err_repin:
> >   	if (!xe_vm_in_lr_mode(vm))
> >   		up_read(&vm->userptr.notifier_lock);
> >   err_put_job:
> > -	if (err)
> > +	if (err && !skip_job_put)
> >   		xe_sched_job_put(job);
> >   err_exec:
> >   	drm_exec_fini(&exec);
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> > index 3911d14522ee..98776d02d634 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
> >   	if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
> >   		return -EINVAL;
> > -	if (eci[0].engine_class >= DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
> > -		bool sync = eci[0].engine_class ==
> > -			DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
> > -
> > +	if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
> >   		for_each_gt(gt, xe, id) {
> >   			struct xe_exec_queue *new;
> > @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
> >   						   args->width, hwe,
> >   						   EXEC_QUEUE_FLAG_PERSISTENT |
> >   						   EXEC_QUEUE_FLAG_VM |
> > -						   (sync ? 0 :
> > -						    EXEC_QUEUE_FLAG_VM_ASYNC) |
> >   						   (id ?
> >   						    EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
> >   						    0));
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > index 52f0927d0d9b..c78f6e8b41c4 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > @@ -74,8 +74,6 @@ struct xe_exec_queue {
> >   #define EXEC_QUEUE_FLAG_VM			BIT(4)
> >   /* child of VM queue for multi-tile VM jobs */
> >   #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD	BIT(5)
> > -/* VM jobs for this queue are asynchronous */
> > -#define EXEC_QUEUE_FLAG_VM_ASYNC		BIT(6)
> >   	/**
> >   	 * @flags: flags for this exec queue, should statically setup aside from ban
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index cf2eb44a71db..4b0c976c003a 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
> >   			struct xe_gt *gt = tile->primary_gt;
> >   			struct xe_vm *migrate_vm;
> >   			struct xe_exec_queue *q;
> > -			u32 create_flags = EXEC_QUEUE_FLAG_VM |
> > -				((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
> > -				EXEC_QUEUE_FLAG_VM_ASYNC : 0);
> > +			u32 create_flags = EXEC_QUEUE_FLAG_VM;
> >   			if (!vm->pt_root[id])
> >   				continue;
> > @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
> >   	return ERR_PTR(err);
> >   }
> > -static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
> > -{
> > -	return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
> > -		!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
> > -}
> > -
> >   static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> >   			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
> >   			u32 num_syncs, bool immediate, bool first_op,
> > -			bool last_op)
> > +			bool last_op, bool async)
> >   {
> >   	struct dma_fence *fence;
> >   	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> > @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> >   	if (last_op)
> >   		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> > -	if (last_op && xe_vm_sync_mode(vm, q))
> > +	if (last_op && !async)
> >   		dma_fence_wait(fence, true);
> >   	dma_fence_put(fence);
> > @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> >   static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
> >   		      struct xe_bo *bo, struct xe_sync_entry *syncs,
> >   		      u32 num_syncs, bool immediate, bool first_op,
> > -		      bool last_op)
> > +		      bool last_op, bool async)
> >   {
> >   	int err;
> > @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue
> >   	}
> >   	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
> > -			    last_op);
> > +			    last_op, async);
> >   }
> >   static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> >   			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
> > -			u32 num_syncs, bool first_op, bool last_op)
> > +			u32 num_syncs, bool first_op, bool last_op, bool async)
> >   {
> >   	struct dma_fence *fence;
> >   	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> > @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> >   	xe_vma_destroy(vma, fence);
> >   	if (last_op)
> >   		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> > -	if (last_op && xe_vm_sync_mode(vm, q))
> > +	if (last_op && !async)
> >   		dma_fence_wait(fence, true);
> 
> It looks like we're dropping the error return code here.
> 

I am aware of this. This is fixed in the larger refactor of the VM bind
error handling [1]. The idea with this series is to land the uAPI and
get the implementation 100% correct in the larger follow-up series.

Matt

[1] https://patchwork.freedesktop.org/series/125608/
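
For reference, the eventual fix amounts to checking the dma_fence_wait()
return instead of dropping it, much like vm_bind_ioctl_signal_fences()
already does in this series. A minimal sketch (hypothetical helper name,
not taken from [1]):

#include <linux/dma-fence.h>

/* Hypothetical helper: wait on the fence when running synchronously
 * and propagate the error instead of dropping it.
 */
static int vm_bind_op_wait(struct dma_fence *fence, bool sync)
{
	long timeout;

	if (!sync)
		return 0;

	timeout = dma_fence_wait(fence, true);
	return timeout < 0 ? -EINTR : 0;
}

/* Callers such as xe_vm_unbind() would then do:
 *
 *	err = vm_bind_op_wait(fence, last_op && !async);
 *	dma_fence_put(fence);
 */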

> 
> >   	dma_fence_put(fence);
> > @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> >   #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
> >   				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
> > -				    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
> >   				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> >   int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> > @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> >   		flags |= XE_VM_FLAG_SCRATCH_PAGE;
> >   	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
> >   		flags |= XE_VM_FLAG_LR_MODE;
> > -	if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
> > -		flags |= XE_VM_FLAG_ASYNC_DEFAULT;
> >   	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> >   		flags |= XE_VM_FLAG_FAULT_MODE;
> > @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] = {
> >   static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
> >   			  struct xe_exec_queue *q, u32 region,
> >   			  struct xe_sync_entry *syncs, u32 num_syncs,
> > -			  bool first_op, bool last_op)
> > +			  bool first_op, bool last_op, bool async)
> >   {
> >   	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> >   	int err;
> > @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
> >   	if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
> >   		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
> > -				  true, first_op, last_op);
> > +				  true, first_op, last_op, async);
> >   	} else {
> >   		int i;
> > @@ -2400,6 +2389,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
> >   		}
> >   		op->q = q;
> > +		if (async)
> > +			op->flags |= XE_VMA_OP_ASYNC;
> >   		switch (op->base.op) {
> >   		case DRM_GPUVA_OP_MAP:
> > @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >   				 op->syncs, op->num_syncs,
> >   				 op->map.immediate || !xe_vm_in_fault_mode(vm),
> >   				 op->flags & XE_VMA_OP_FIRST,
> > -				 op->flags & XE_VMA_OP_LAST);
> > +				 op->flags & XE_VMA_OP_LAST,
> > +				 op->flags & XE_VMA_OP_ASYNC);
> >   		break;
> >   	case DRM_GPUVA_OP_REMAP:
> >   	{
> > @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >   					   op->num_syncs,
> >   					   op->flags & XE_VMA_OP_FIRST,
> >   					   op->flags & XE_VMA_OP_LAST &&
> > -					   !prev && !next);
> > +					   !prev && !next,
> > +					   op->flags & XE_VMA_OP_ASYNC);
> >   			if (err)
> >   				break;
> >   			op->remap.unmap_done = true;
> > @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >   			err = xe_vm_bind(vm, op->remap.prev, op->q,
> >   					 xe_vma_bo(op->remap.prev), op->syncs,
> >   					 op->num_syncs, true, false,
> > -					 op->flags & XE_VMA_OP_LAST && !next);
> > +					 op->flags & XE_VMA_OP_LAST && !next,
> > +					 op->flags & XE_VMA_OP_ASYNC);
> >   			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
> >   			if (err)
> >   				break;
> > @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >   					 xe_vma_bo(op->remap.next),
> >   					 op->syncs, op->num_syncs,
> >   					 true, false,
> > -					 op->flags & XE_VMA_OP_LAST);
> > +					 op->flags & XE_VMA_OP_LAST,
> > +					 op->flags & XE_VMA_OP_ASYNC);
> >   			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
> >   			if (err)
> >   				break;
> > @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >   	case DRM_GPUVA_OP_UNMAP:
> >   		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
> >   				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
> > -				   op->flags & XE_VMA_OP_LAST);
> > +				   op->flags & XE_VMA_OP_LAST,
> > +				   op->flags & XE_VMA_OP_ASYNC);
> >   		break;
> >   	case DRM_GPUVA_OP_PREFETCH:
> >   		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
> >   				     op->syncs, op->num_syncs,
> >   				     op->flags & XE_VMA_OP_FIRST,
> > -				     op->flags & XE_VMA_OP_LAST);
> > +				     op->flags & XE_VMA_OP_LAST,
> > +				     op->flags & XE_VMA_OP_ASYNC);
> >   		break;
> >   	default:
> >   		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
> > @@ -2808,16 +2805,16 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
> >   #ifdef TEST_VM_ASYNC_OPS_ERROR
> >   #define SUPPORTED_FLAGS	\
> > -	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
> > -	 DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
> > -	 DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> > +	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
> > +	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> >   #else
> >   #define SUPPORTED_FLAGS	\
> > -	(DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
> > +	(DRM_XE_VM_BIND_FLAG_READONLY | \
> >   	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
> >   	 0xffff)
> >   #endif
> >   #define XE_64K_PAGE_MASK 0xffffull
> > +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> >   #define MAX_BINDS	512	/* FIXME: Picking random upper limit */
> > @@ -2829,7 +2826,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> >   	int err;
> >   	int i;
> > -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > +	if (XE_IOCTL_DBG(xe, args->pad) ||
> >   	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> >   		return -EINVAL;
> > @@ -2857,6 +2854,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> >   		*bind_ops = &args->bind;
> >   	}
> > +	*async = !(args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
> > +
> > +	if (XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > +	    XE_IOCTL_DBG(xe, !*async && args->syncs.num_syncs)) {
> > +		err = -EINVAL;
> > +		goto free_bind_ops;
> > +	}
> > +
> >   	for (i = 0; i < args->num_binds; ++i) {
> >   		u64 range = (*bind_ops)[i].range;
> >   		u64 addr = (*bind_ops)[i].addr;
> > @@ -2887,18 +2892,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> >   			goto free_bind_ops;
> >   		}
> > -		if (i == 0) {
> > -			*async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
> > -			if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
> > -				err = -EINVAL;
> > -				goto free_bind_ops;
> > -			}
> > -		} else if (XE_IOCTL_DBG(xe, *async !=
> > -					!!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
> > -			err = -EINVAL;
> > -			goto free_bind_ops;
> > -		}
> > -
> >   		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
> >   		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
> >   		    XE_IOCTL_DBG(xe, obj && is_null) ||
> > @@ -2951,7 +2944,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> >   static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> >   				       struct xe_exec_queue *q,
> >   				       struct xe_sync_entry *syncs,
> > -				       int num_syncs)
> > +				       int num_syncs, bool async)
> >   {
> >   	struct dma_fence *fence;
> >   	int i, err = 0;
> > @@ -2967,7 +2960,7 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> >   	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
> >   				     fence);
> > -	if (xe_vm_sync_mode(vm, q)) {
> > +	if (!async) {
> >   		long timeout = dma_fence_wait(fence, true);
> >   		if (timeout < 0)
> > @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   	if (err)
> >   		return err;
> > -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > +	if (XE_IOCTL_DBG(xe, args->pad) ||
> >   	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> >   		return -EINVAL;
> > @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   			err = -EINVAL;
> >   			goto put_exec_queue;
> >   		}
> > -
> > -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> > -				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
> > -			err = -EINVAL;
> > -			goto put_exec_queue;
> > -		}
> >   	}
> >   	vm = xe_vm_lookup(xef, args->vm_id);
> > @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   		goto put_exec_queue;
> >   	}
> > -	if (!args->exec_queue_id) {
> > -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> > -				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
> > -			err = -EINVAL;
> > -			goto put_vm;
> > -		}
> > -	}
> > -
> >   	err = down_write_killable(&vm->lock);
> >   	if (err)
> >   		goto put_vm;
> > @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   		}
> >   	}
> > -	if (args->num_syncs) {
> > -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> > +	if (args->syncs.num_syncs) {
> > +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs), GFP_KERNEL);
> >   		if (!syncs) {
> >   			err = -ENOMEM;
> >   			goto put_obj;
> >   		}
> >   	}
> > -	syncs_user = u64_to_user_ptr(args->syncs);
> > -	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
> > +	syncs_user = u64_to_user_ptr(args->syncs.syncs);
> > +	for (num_syncs = 0; num_syncs < args->syncs.num_syncs; num_syncs++) {
> >   		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
> >   					  &syncs_user[num_syncs],
> >   					  (xe_vm_in_lr_mode(vm) ?
> > @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >   	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
> >   free_syncs:
> >   	if (err == -ENODATA)
> > -		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
> > +		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs,
> > +						  async);
> >   	while (num_syncs--)
> >   		xe_sync_entry_cleanup(&syncs[num_syncs]);
> > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> > index 23abdfd8622f..ce8b9bde7e9c 100644
> > --- a/drivers/gpu/drm/xe/xe_vm_types.h
> > +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> > @@ -167,13 +167,12 @@ struct xe_vm {
> >   	 */
> >   #define XE_VM_FLAG_64K			BIT(0)
> >   #define XE_VM_FLAG_LR_MODE		BIT(1)
> > -#define XE_VM_FLAG_ASYNC_DEFAULT	BIT(2)
> > -#define XE_VM_FLAG_MIGRATION		BIT(3)
> > -#define XE_VM_FLAG_SCRATCH_PAGE		BIT(4)
> > -#define XE_VM_FLAG_FAULT_MODE		BIT(5)
> > -#define XE_VM_FLAG_BANNED		BIT(6)
> > -#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(8, 7), flags)
> > -#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(8, 7), (tile)->id)
> > +#define XE_VM_FLAG_MIGRATION		BIT(2)
> > +#define XE_VM_FLAG_SCRATCH_PAGE		BIT(3)
> > +#define XE_VM_FLAG_FAULT_MODE		BIT(4)
> > +#define XE_VM_FLAG_BANNED		BIT(5)
> > +#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(7, 6), flags)
> > +#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(7, 6), (tile)->id)
> >   	unsigned long flags;
> >   	/** @composite_fence_ctx: context composite fence */
> > @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
> >   	XE_VMA_OP_PREV_COMMITTED	= BIT(3),
> >   	/** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */
> >   	XE_VMA_OP_NEXT_COMMITTED	= BIT(4),
> > +	/** @XE_VMA_OP_ASYNC: operation is async */
> > +	XE_VMA_OP_ASYNC			= BIT(5),
> >   };
> >   /** struct xe_vma_op - VMA operation */
> > diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> > index eb03a49c17a1..fd8172fe2d9a 100644
> > --- a/include/uapi/drm/xe_drm.h
> > +++ b/include/uapi/drm/xe_drm.h
> > @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
> >   	 * Kernel only classes (not actual hardware engine class). Used for
> >   	 * creating ordered queues of VM bind operations.
> >   	 */
> > -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC	5
> > -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC	6
> > +#define DRM_XE_ENGINE_CLASS_VM_BIND		5
> >   	__u16 engine_class;
> >   	__u16 engine_instance;
> > @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
> >   	 * still enable recoverable pagefaults if supported by the device.
> >   	 */
> >   #define DRM_XE_VM_CREATE_FLAG_LR_MODE	        (1 << 1)
> > -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT	(1 << 2)
> >   	/*
> >   	 * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
> >   	 * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to be allocated
> > @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
> >   	 * The xe driver internally uses recoverable pagefaults to implement
> >   	 * this.
> >   	 */
> > -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 3)
> > +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 2)
> >   	/** @flags: Flags */
> >   	__u32 flags;
> > @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
> >   	__u32 op;
> >   #define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
> > -#define DRM_XE_VM_BIND_FLAG_ASYNC	(1 << 1)
> >   	/*
> >   	 * Valid on a faulting VM only, do the MAP operation immediately rather
> >   	 * than deferring the MAP to the page fault handler.
> >   	 */
> > -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 2)
> > +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
> >   	/*
> >   	 * When the NULL flag is set, the page tables are setup with a special
> >   	 * bit which indicates writes are dropped and all reads return zero.  In
> > @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
> >   	 * operations, the BO handle MBZ, and the BO offset MBZ. This flag is
> >   	 * intended to implement VK sparse bindings.
> >   	 */
> > -#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 3)
> > +#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
> >   	/** @flags: Bind flags */
> >   	__u32 flags;
> > @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
> >   	__u64 reserved[3];
> >   };
> > +/**
> > + * struct drm_xe_syncs - In / out syncs for IOCTLs.
> > + */
> > +struct drm_xe_syncs {
> > +	/** @num_syncs: amount of syncs to wait on */
> > +	__u32 num_syncs;
> > +
> > +	/*
> > +	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
> > +	 */
> > +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
> > +	/** @flags: Sync flags */
> > +	__u32 flags;
> > +
> > +	/** @syncs: pointer to struct drm_xe_sync array */
> > +	__u64 syncs;
> > +
> > +	/** @reserved: Reserved */
> > +	__u64 reserved[2];
> > +};
> > +
> >   struct drm_xe_vm_bind {
> >   	/** @extensions: Pointer to the first extension struct, if any */
> >   	__u64 extensions;
> > @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
> >   		__u64 vector_of_binds;
> >   	};
> > -	/** @pad: MBZ */
> > -	__u32 pad2;
> > -
> > -	/** @num_syncs: amount of syncs to wait on */
> > -	__u32 num_syncs;
> > -
> > -	/** @syncs: pointer to struct drm_xe_sync array */
> > -	__u64 syncs;
> > +	/** @syncs: syncs for bind */
> > +	struct drm_xe_syncs syncs;
> >   	/** @reserved: Reserved */
> >   	__u64 reserved[2];
> > @@ -974,14 +986,14 @@ struct drm_xe_exec {
> >   	/** @extensions: Pointer to the first extension struct, if any */
> >   	__u64 extensions;
> > +	/** @pad: MBZ */
> > +	__u32 pad;
> > +
> >   	/** @exec_queue_id: Exec queue ID for the batch buffer */
> >   	__u32 exec_queue_id;
> > -	/** @num_syncs: Amount of struct drm_xe_sync in array. */
> > -	__u32 num_syncs;
> > -
> > -	/** @syncs: Pointer to struct drm_xe_sync array. */
> > -	__u64 syncs;
> > +	/** @syncs: syncs for exec */
> > +	struct drm_xe_syncs syncs;
> >   	/**
> >   	 * @address: address of batch buffer if num_batch_buffer == 1 or an
> > @@ -995,8 +1007,8 @@ struct drm_xe_exec {
> >   	 */
> >   	__u16 num_batch_buffer;
> > -	/** @pad: MBZ */
> > -	__u16 pad[3];
> > +	/** @pad2: MBZ */
> > +	__u16 pad2[3];
> >   	/** @reserved: Reserved */
> >   	__u64 reserved[2];

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling Matthew Brost
  2023-12-07 19:51   ` Rodrigo Vivi
@ 2023-12-08 15:00   ` Thomas Hellström
  2023-12-08  9:45     ` Matthew Brost
  2023-12-08 12:24     ` Matthew Brost
  1 sibling, 2 replies; 22+ messages in thread
From: Thomas Hellström @ 2023-12-08 15:00 UTC (permalink / raw)
  To: Matthew Brost, intel-xe; +Cc: Francois Dugast, Rodrigo Vivi


On 12/7/23 06:57, Matthew Brost wrote:
> Remove the concept of async vs sync VM bind queues; instead make async
> vs sync a per-IOCTL choice. Since this is per IOCTL, it makes sense to
> have a single IOCTL-level flag rather than a per VM bind op flag too.
> Add DRM_XE_SYNCS_FLAG_WAIT_FOR_OP, an input sync flag, to support
> this. Support this new flag for both the VM bind IOCTL and the exec
> IOCTL to match behavior.
>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Cc: Francois Dugast <francois.dugast@intel.com>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++----
>   drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
>   drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
>   drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-------------
>   drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
>   include/uapi/drm/xe_drm.h                |  56 +++++++-----
>   6 files changed, 129 insertions(+), 119 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> index 92b0da6580e8..c62cabfaa112 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -130,12 +130,15 @@ static int xe_exec_begin(struct drm_exec *exec, struct xe_vm *vm)
>   	return err;
>   }
>   
> +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> +
>   int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   {
>   	struct xe_device *xe = to_xe_device(dev);
>   	struct xe_file *xef = to_xe_file(file);
>   	struct drm_xe_exec *args = data;
> -	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
> +	struct drm_xe_sync __user *syncs_user =
> +		u64_to_user_ptr(args->syncs.syncs);
>   	u64 __user *addresses_user = u64_to_user_ptr(args->address);
>   	struct xe_exec_queue *q;
>   	struct xe_sync_entry *syncs = NULL;
> @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   	struct drm_exec exec;
>   	u32 i, num_syncs = 0;
>   	struct xe_sched_job *job;
> -	struct dma_fence *rebind_fence;
> +	struct dma_fence *rebind_fence, *job_fence;
>   	struct xe_vm *vm;
> -	bool write_locked;
> +	bool write_locked, skip_job_put = false;
> +	bool wait = args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
>   	ktime_t end = 0;
>   	int err = 0;
>   
>   	if (XE_IOCTL_DBG(xe, args->extensions) ||
> -	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
> -	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> +	    XE_IOCTL_DBG(xe, args->pad || args->pad2[0] || args->pad2[1] || args->pad2[2]) ||
> +	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
> +	    XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> +	    XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
>   		return -EINVAL;
>   
>   	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   		goto err_exec_queue;
>   	}
>   
> -	if (args->num_syncs) {
> -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> +	if (args->syncs.num_syncs) {
> +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs),
> +				GFP_KERNEL);
>   		if (!syncs) {
>   			err = -ENOMEM;
>   			goto err_exec_queue;
> @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   
>   	vm = q->vm;
>   
> -	for (i = 0; i < args->num_syncs; i++) {
> +	for (i = 0; i < args->syncs.num_syncs; i++) {
>   		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
>   					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
>   					  (xe_vm_in_lr_mode(vm) ?
> @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   				err = PTR_ERR(fence);
>   				goto err_exec;
>   			}
> +
>   			for (i = 0; i < num_syncs; i++)
>   				xe_sync_entry_signal(&syncs[i], NULL, fence);
> +
>   			xe_exec_queue_last_fence_set(q, vm, fence);
> +			if (wait) {
> +				long timeout = dma_fence_wait(fence, true);
> +
> +				if (timeout < 0)
> +					err = -EINTR;
> +			}

Here it looks like we will rerun the same IOCTL if we return -EINTR:
the expected user-space action on -EINTR is to just restart the IOCTL
without any argument changes. A solution is to add an ioctl argument
cookie (or to skip sync vm binds and have the user just use the
0-batch-buffer / 0-bind calls, or wait for an out-fence). If you go for
the cookie solution then IMO we should keep the -ERESTARTSYS returned
from dma_fence_wait(), since it's converted to -EINTR on
return-to-user-space and the kernel restarts the IOCTL automatically if
there was no requested-for-delivery signal pending.

I think the simplest solution at this point is to skip the sync
behaviour, in particular if we enable the 0 batch / bind possibility.

If we still want to provide it, we could add a cookie address as an
extension to the ioctl and activate sync only if present? (Just
throwing out ideas here.)
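
To make the cookie idea concrete, a minimal sketch (hypothetical -- the
helper and the cookie argument are invented for illustration, nothing
here is part of the series). Userspace would pass the address of a
zero-initialized __u64; the kernel marks it once the job has been
pushed, so an automatically restarted IOCTL only re-waits instead of
resubmitting:

#include <linux/dma-fence.h>
#include <linux/uaccess.h>

static long exec_sync_wait(struct xe_sched_job *job, u64 __user *cookie)
{
	u64 submitted;

	if (get_user(submitted, cookie))
		return -EFAULT;

	if (!submitted) {
		/* First pass: push the job, then record that we did. */
		xe_sched_job_push(job);
		if (put_user(1ULL, cookie))
			return -EFAULT;
	}

	/*
	 * Keep -ERESTARTSYS: it becomes -EINTR on return to userspace,
	 * and the kernel restarts the IOCTL automatically when no
	 * requested-for-delivery signal is pending; the cookie then
	 * skips the second push.
	 */
	return dma_fence_wait(&job->drm.s_fence->finished, true);
}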

>   			dma_fence_put(fence);
>   		}
>   
> @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   	 * the job and let the DRM scheduler / backend clean up the job.
>   	 */
>   	xe_sched_job_arm(job);
> +	job_fence = &job->drm.s_fence->finished;
> +	if (wait)
> +		dma_fence_get(job_fence);
>   	if (!xe_vm_in_lr_mode(vm)) {
>   		/* Block userptr invalidations / BO eviction */
> -		dma_resv_add_fence(&vm->resv,
> -				   &job->drm.s_fence->finished,
> +		dma_resv_add_fence(&vm->resv, job_fence,
>   				   DMA_RESV_USAGE_BOOKKEEP);
>   
>   		/*
>   		 * Make implicit sync work across drivers, assuming all external
>   		 * BOs are written as we don't pass in a read / write list.
>   		 */
> -		xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
> -					DMA_RESV_USAGE_WRITE);
> +		xe_vm_fence_all_extobjs(vm, job_fence, DMA_RESV_USAGE_WRITE);
>   	}
>   
>   	for (i = 0; i < num_syncs; i++)
> -		xe_sync_entry_signal(&syncs[i], job,
> -				     &job->drm.s_fence->finished);
> +		xe_sync_entry_signal(&syncs[i], job, job_fence);
>   
>   	if (xe_exec_queue_is_lr(q))
>   		q->ring_ops->emit_job(job);
>   	if (!xe_vm_in_lr_mode(vm))
> -		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
> +		xe_exec_queue_last_fence_set(q, vm, job_fence);
>   	xe_sched_job_push(job);
>   	xe_vm_reactivate_rebind(vm);
>   
> -	if (!err && !xe_vm_in_lr_mode(vm)) {
> +	if (!xe_vm_in_lr_mode(vm)) {
>   		spin_lock(&xe->ttm.lru_lock);
>   		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
>   		spin_unlock(&xe->ttm.lru_lock);
>   	}
>   
> +	skip_job_put = true;
> +	if (wait) {
> +		long timeout = dma_fence_wait(job_fence, true);
> +
> +		dma_fence_put(job_fence);
> +		if (timeout < 0)
> +			err = -EINTR;
> +	}
> +
>   err_repin:
>   	if (!xe_vm_in_lr_mode(vm))
>   		up_read(&vm->userptr.notifier_lock);
>   err_put_job:
> -	if (err)
> +	if (err && !skip_job_put)
>   		xe_sched_job_put(job);
>   err_exec:
>   	drm_exec_fini(&exec);
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 3911d14522ee..98776d02d634 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>   	if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
>   		return -EINVAL;
>   
> -	if (eci[0].engine_class >= DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
> -		bool sync = eci[0].engine_class ==
> -			DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
> -
> +	if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
>   		for_each_gt(gt, xe, id) {
>   			struct xe_exec_queue *new;
>   
> @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>   						   args->width, hwe,
>   						   EXEC_QUEUE_FLAG_PERSISTENT |
>   						   EXEC_QUEUE_FLAG_VM |
> -						   (sync ? 0 :
> -						    EXEC_QUEUE_FLAG_VM_ASYNC) |
>   						   (id ?
>   						    EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
>   						    0));
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> index 52f0927d0d9b..c78f6e8b41c4 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> @@ -74,8 +74,6 @@ struct xe_exec_queue {
>   #define EXEC_QUEUE_FLAG_VM			BIT(4)
>   /* child of VM queue for multi-tile VM jobs */
>   #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD	BIT(5)
> -/* VM jobs for this queue are asynchronous */
> -#define EXEC_QUEUE_FLAG_VM_ASYNC		BIT(6)
>   
>   	/**
>   	 * @flags: flags for this exec queue, should statically setup aside from ban
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index cf2eb44a71db..4b0c976c003a 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
>   			struct xe_gt *gt = tile->primary_gt;
>   			struct xe_vm *migrate_vm;
>   			struct xe_exec_queue *q;
> -			u32 create_flags = EXEC_QUEUE_FLAG_VM |
> -				((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
> -				EXEC_QUEUE_FLAG_VM_ASYNC : 0);
> +			u32 create_flags = EXEC_QUEUE_FLAG_VM;
>   
>   			if (!vm->pt_root[id])
>   				continue;
> @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
>   	return ERR_PTR(err);
>   }
>   
> -static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
> -{
> -	return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
> -		!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
> -}
> -
>   static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>   			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
>   			u32 num_syncs, bool immediate, bool first_op,
> -			bool last_op)
> +			bool last_op, bool async)
>   {
>   	struct dma_fence *fence;
>   	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>   
>   	if (last_op)
>   		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> -	if (last_op && xe_vm_sync_mode(vm, q))
> +	if (last_op && !async)
>   		dma_fence_wait(fence, true);
>   	dma_fence_put(fence);
>   
> @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>   static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
>   		      struct xe_bo *bo, struct xe_sync_entry *syncs,
>   		      u32 num_syncs, bool immediate, bool first_op,
> -		      bool last_op)
> +		      bool last_op, bool async)
>   {
>   	int err;
>   
> @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue
>   	}
>   
>   	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
> -			    last_op);
> +			    last_op, async);
>   }
>   
>   static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>   			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
> -			u32 num_syncs, bool first_op, bool last_op)
> +			u32 num_syncs, bool first_op, bool last_op, bool async)
>   {
>   	struct dma_fence *fence;
>   	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>   	xe_vma_destroy(vma, fence);
>   	if (last_op)
>   		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> -	if (last_op && xe_vm_sync_mode(vm, q))
> +	if (last_op && !async)
>   		dma_fence_wait(fence, true);

It looks like we're dropping the error return code here.


>   	dma_fence_put(fence);
>   
> @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>   
>   #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
>   				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
> -				    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
>   				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
>   
>   int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
>   		flags |= XE_VM_FLAG_SCRATCH_PAGE;
>   	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
>   		flags |= XE_VM_FLAG_LR_MODE;
> -	if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
> -		flags |= XE_VM_FLAG_ASYNC_DEFAULT;
>   	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
>   		flags |= XE_VM_FLAG_FAULT_MODE;
>   
> @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] = {
>   static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
>   			  struct xe_exec_queue *q, u32 region,
>   			  struct xe_sync_entry *syncs, u32 num_syncs,
> -			  bool first_op, bool last_op)
> +			  bool first_op, bool last_op, bool async)
>   {
>   	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>   	int err;
> @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
>   
>   	if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
>   		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
> -				  true, first_op, last_op);
> +				  true, first_op, last_op, async);
>   	} else {
>   		int i;
>   
> @@ -2400,6 +2389,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
>   		}
>   
>   		op->q = q;
> +		if (async)
> +			op->flags |= XE_VMA_OP_ASYNC;
>   
>   		switch (op->base.op) {
>   		case DRM_GPUVA_OP_MAP:
> @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>   				 op->syncs, op->num_syncs,
>   				 op->map.immediate || !xe_vm_in_fault_mode(vm),
>   				 op->flags & XE_VMA_OP_FIRST,
> -				 op->flags & XE_VMA_OP_LAST);
> +				 op->flags & XE_VMA_OP_LAST,
> +				 op->flags & XE_VMA_OP_ASYNC);
>   		break;
>   	case DRM_GPUVA_OP_REMAP:
>   	{
> @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>   					   op->num_syncs,
>   					   op->flags & XE_VMA_OP_FIRST,
>   					   op->flags & XE_VMA_OP_LAST &&
> -					   !prev && !next);
> +					   !prev && !next,
> +					   op->flags & XE_VMA_OP_ASYNC);
>   			if (err)
>   				break;
>   			op->remap.unmap_done = true;
> @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>   			err = xe_vm_bind(vm, op->remap.prev, op->q,
>   					 xe_vma_bo(op->remap.prev), op->syncs,
>   					 op->num_syncs, true, false,
> -					 op->flags & XE_VMA_OP_LAST && !next);
> +					 op->flags & XE_VMA_OP_LAST && !next,
> +					 op->flags & XE_VMA_OP_ASYNC);
>   			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
>   			if (err)
>   				break;
> @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>   					 xe_vma_bo(op->remap.next),
>   					 op->syncs, op->num_syncs,
>   					 true, false,
> -					 op->flags & XE_VMA_OP_LAST);
> +					 op->flags & XE_VMA_OP_LAST,
> +					 op->flags & XE_VMA_OP_ASYNC);
>   			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
>   			if (err)
>   				break;
> @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>   	case DRM_GPUVA_OP_UNMAP:
>   		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
>   				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
> -				   op->flags & XE_VMA_OP_LAST);
> +				   op->flags & XE_VMA_OP_LAST,
> +				   op->flags & XE_VMA_OP_ASYNC);
>   		break;
>   	case DRM_GPUVA_OP_PREFETCH:
>   		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
>   				     op->syncs, op->num_syncs,
>   				     op->flags & XE_VMA_OP_FIRST,
> -				     op->flags & XE_VMA_OP_LAST);
> +				     op->flags & XE_VMA_OP_LAST,
> +				     op->flags & XE_VMA_OP_ASYNC);
>   		break;
>   	default:
>   		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
> @@ -2808,16 +2805,16 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
>   
>   #ifdef TEST_VM_ASYNC_OPS_ERROR
>   #define SUPPORTED_FLAGS	\
> -	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
> -	 DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
> -	 DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> +	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
> +	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
>   #else
>   #define SUPPORTED_FLAGS	\
> -	(DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
> +	(DRM_XE_VM_BIND_FLAG_READONLY | \
>   	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
>   	 0xffff)
>   #endif
>   #define XE_64K_PAGE_MASK 0xffffull
> +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
>   
>   #define MAX_BINDS	512	/* FIXME: Picking random upper limit */
>   
> @@ -2829,7 +2826,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>   	int err;
>   	int i;
>   
> -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> +	if (XE_IOCTL_DBG(xe, args->pad) ||
>   	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>   		return -EINVAL;
>   
> @@ -2857,6 +2854,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>   		*bind_ops = &args->bind;
>   	}
>   
> +	*async = !(args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
> +
> +	if (XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> +	    XE_IOCTL_DBG(xe, !*async && args->syncs.num_syncs)) {
> +		err = -EINVAL;
> +		goto free_bind_ops;
> +	}
> +
>   	for (i = 0; i < args->num_binds; ++i) {
>   		u64 range = (*bind_ops)[i].range;
>   		u64 addr = (*bind_ops)[i].addr;
> @@ -2887,18 +2892,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>   			goto free_bind_ops;
>   		}
>   
> -		if (i == 0) {
> -			*async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
> -			if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
> -				err = -EINVAL;
> -				goto free_bind_ops;
> -			}
> -		} else if (XE_IOCTL_DBG(xe, *async !=
> -					!!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
> -			err = -EINVAL;
> -			goto free_bind_ops;
> -		}
> -
>   		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
>   		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
>   		    XE_IOCTL_DBG(xe, obj && is_null) ||
> @@ -2951,7 +2944,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>   static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
>   				       struct xe_exec_queue *q,
>   				       struct xe_sync_entry *syncs,
> -				       int num_syncs)
> +				       int num_syncs, bool async)
>   {
>   	struct dma_fence *fence;
>   	int i, err = 0;
> @@ -2967,7 +2960,7 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
>   	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
>   				     fence);
>   
> -	if (xe_vm_sync_mode(vm, q)) {
> +	if (!async) {
>   		long timeout = dma_fence_wait(fence, true);
>   
>   		if (timeout < 0)
> @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   	if (err)
>   		return err;
>   
> -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> +	if (XE_IOCTL_DBG(xe, args->pad) ||
>   	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>   		return -EINVAL;
>   
> @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   			err = -EINVAL;
>   			goto put_exec_queue;
>   		}
> -
> -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> -				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
> -			err = -EINVAL;
> -			goto put_exec_queue;
> -		}
>   	}
>   
>   	vm = xe_vm_lookup(xef, args->vm_id);
> @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   		goto put_exec_queue;
>   	}
>   
> -	if (!args->exec_queue_id) {
> -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> -				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
> -			err = -EINVAL;
> -			goto put_vm;
> -		}
> -	}
> -
>   	err = down_write_killable(&vm->lock);
>   	if (err)
>   		goto put_vm;
> @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   		}
>   	}
>   
> -	if (args->num_syncs) {
> -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> +	if (args->syncs.num_syncs) {
> +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs), GFP_KERNEL);
>   		if (!syncs) {
>   			err = -ENOMEM;
>   			goto put_obj;
>   		}
>   	}
>   
> -	syncs_user = u64_to_user_ptr(args->syncs);
> -	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
> +	syncs_user = u64_to_user_ptr(args->syncs.syncs);
> +	for (num_syncs = 0; num_syncs < args->syncs.num_syncs; num_syncs++) {
>   		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
>   					  &syncs_user[num_syncs],
>   					  (xe_vm_in_lr_mode(vm) ?
> @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
>   free_syncs:
>   	if (err == -ENODATA)
> -		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
> +		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs,
> +						  async);
>   	while (num_syncs--)
>   		xe_sync_entry_cleanup(&syncs[num_syncs]);
>   
> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> index 23abdfd8622f..ce8b9bde7e9c 100644
> --- a/drivers/gpu/drm/xe/xe_vm_types.h
> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> @@ -167,13 +167,12 @@ struct xe_vm {
>   	 */
>   #define XE_VM_FLAG_64K			BIT(0)
>   #define XE_VM_FLAG_LR_MODE		BIT(1)
> -#define XE_VM_FLAG_ASYNC_DEFAULT	BIT(2)
> -#define XE_VM_FLAG_MIGRATION		BIT(3)
> -#define XE_VM_FLAG_SCRATCH_PAGE		BIT(4)
> -#define XE_VM_FLAG_FAULT_MODE		BIT(5)
> -#define XE_VM_FLAG_BANNED		BIT(6)
> -#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(8, 7), flags)
> -#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(8, 7), (tile)->id)
> +#define XE_VM_FLAG_MIGRATION		BIT(2)
> +#define XE_VM_FLAG_SCRATCH_PAGE		BIT(3)
> +#define XE_VM_FLAG_FAULT_MODE		BIT(4)
> +#define XE_VM_FLAG_BANNED		BIT(5)
> +#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(7, 6), flags)
> +#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(7, 6), (tile)->id)
>   	unsigned long flags;
>   
>   	/** @composite_fence_ctx: context composite fence */
> @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
>   	XE_VMA_OP_PREV_COMMITTED	= BIT(3),
>   	/** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */
>   	XE_VMA_OP_NEXT_COMMITTED	= BIT(4),
> +	/** @XE_VMA_OP_ASYNC: operation is async */
> +	XE_VMA_OP_ASYNC			= BIT(5),
>   };
>   
>   /** struct xe_vma_op - VMA operation */
> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index eb03a49c17a1..fd8172fe2d9a 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
>   	 * Kernel only classes (not actual hardware engine class). Used for
>   	 * creating ordered queues of VM bind operations.
>   	 */
> -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC	5
> -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC	6
> +#define DRM_XE_ENGINE_CLASS_VM_BIND		5
>   	__u16 engine_class;
>   
>   	__u16 engine_instance;
> @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
>   	 * still enable recoverable pagefaults if supported by the device.
>   	 */
>   #define DRM_XE_VM_CREATE_FLAG_LR_MODE	        (1 << 1)
> -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT	(1 << 2)
>   	/*
>   	 * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
>   	 * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to be allocated
> @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
>   	 * The xe driver internally uses recoverable pagefaults to implement
>   	 * this.
>   	 */
> -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 3)
> +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 2)
>   	/** @flags: Flags */
>   	__u32 flags;
>   
> @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
>   	__u32 op;
>   
>   #define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
> -#define DRM_XE_VM_BIND_FLAG_ASYNC	(1 << 1)
>   	/*
>   	 * Valid on a faulting VM only, do the MAP operation immediately rather
>   	 * than deferring the MAP to the page fault handler.
>   	 */
> -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 2)
> +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
>   	/*
>   	 * When the NULL flag is set, the page tables are setup with a special
>   	 * bit which indicates writes are dropped and all reads return zero.  In
> @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
>   	 * operations, the BO handle MBZ, and the BO offset MBZ. This flag is
>   	 * intended to implement VK sparse bindings.
>   	 */
> -#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 3)
> +#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
>   	/** @flags: Bind flags */
>   	__u32 flags;
>   
> @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
>   	__u64 reserved[3];
>   };
>   
> +/**
> + * struct drm_xe_syncs - In / out syncs for IOCTLs.
> + */
> +struct drm_xe_syncs {
> +	/** @num_syncs: amount of syncs to wait on */
> +	__u32 num_syncs;
> +
> +	/*
> +	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
> +	 */
> +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
> +	/** @flags: Sync flags */
> +	__u32 flags;
> +
> +	/** @syncs: pointer to struct drm_xe_sync array */
> +	__u64 syncs;
> +
> +	/** @reserved: Reserved */
> +	__u64 reserved[2];
> +};
> +
>   struct drm_xe_vm_bind {
>   	/** @extensions: Pointer to the first extension struct, if any */
>   	__u64 extensions;
> @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
>   		__u64 vector_of_binds;
>   	};
>   
> -	/** @pad: MBZ */
> -	__u32 pad2;
> -
> -	/** @num_syncs: amount of syncs to wait on */
> -	__u32 num_syncs;
> -
> -	/** @syncs: pointer to struct drm_xe_sync array */
> -	__u64 syncs;
> +	/** @syncs: syncs for bind */
> +	struct drm_xe_syncs syncs;
>   
>   	/** @reserved: Reserved */
>   	__u64 reserved[2];
> @@ -974,14 +986,14 @@ struct drm_xe_exec {
>   	/** @extensions: Pointer to the first extension struct, if any */
>   	__u64 extensions;
>   
> +	/** @pad: MBZ */
> +	__u32 pad;
> +
>   	/** @exec_queue_id: Exec queue ID for the batch buffer */
>   	__u32 exec_queue_id;
>   
> -	/** @num_syncs: Amount of struct drm_xe_sync in array. */
> -	__u32 num_syncs;
> -
> -	/** @syncs: Pointer to struct drm_xe_sync array. */
> -	__u64 syncs;
> +	/** @syncs: syncs for exec */
> +	struct drm_xe_syncs syncs;
>   
>   	/**
>   	 * @address: address of batch buffer if num_batch_buffer == 1 or an
> @@ -995,8 +1007,8 @@ struct drm_xe_exec {
>   	 */
>   	__u16 num_batch_buffer;
>   
> -	/** @pad: MBZ */
> -	__u16 pad[3];
> +	/** @pad2: MBZ */
> +	__u16 pad2[3];
>   
>   	/** @reserved: Reserved */
>   	__u64 reserved[2];
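
For orientation, with the above a synchronous submission becomes a
per-call choice. A rough userspace sketch against this series' uAPI
(hypothetical usage, error handling elided; note that the -EINTR
restart question discussed above applies exactly here):

#include <string.h>
#include <sys/ioctl.h>
#include <drm/xe_drm.h>	/* as modified by this series */

/* Submit one batch buffer and block in the IOCTL until it completes.
 * DRM_XE_SYNCS_FLAG_WAIT_FOR_OP requires num_syncs == 0, which the
 * memset leaves in place.
 */
static int exec_sync(int fd, __u32 exec_queue_id, __u64 batch_addr)
{
	struct drm_xe_exec exec;

	memset(&exec, 0, sizeof(exec));
	exec.exec_queue_id = exec_queue_id;
	exec.address = batch_addr;
	exec.num_batch_buffer = 1;
	exec.syncs.flags = DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;

	return ioctl(fd, DRM_IOCTL_XE_EXEC, &exec);
}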

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-xe] [RFC PATCH 5/7] drm/xe: Take in-syncs into account when num_execs or num_binds == 0
  2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 5/7] drm/xe: Take in-syncs into account when num_execs or num_binds == 0 Matthew Brost
@ 2023-12-08 15:04   ` Thomas Hellström
  2023-12-12 17:18     ` Matthew Brost
  0 siblings, 1 reply; 22+ messages in thread
From: Thomas Hellström @ 2023-12-08 15:04 UTC (permalink / raw)
  To: Matthew Brost, intel-xe


On 12/7/23 06:57, Matthew Brost wrote:
> Wait on in-syncs before signaling out-syncs if num_execs or num_binds ==
> 0 in the execbuf IOCTL or VM bind IOCTL, respectively.
>
> v2: Wait on last fence in addition to in-fences (Thomas)
> v3: Use function for in-fence signaling
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_exec.c | 10 ++++-
>   drivers/gpu/drm/xe/xe_sync.c | 74 ++++++++++++++++++++++++++++++++++++
>   drivers/gpu/drm/xe/xe_sync.h |  5 +++
>   drivers/gpu/drm/xe/xe_vm.c   | 41 ++++++++++++++++----
>   4 files changed, 121 insertions(+), 9 deletions(-)

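With this, a zero-ops exec effectively becomes a fence-combine primitive;
roughly (hypothetical userspace sketch, error handling omitted):

	/*
	 * Signal the out-sync once all in-syncs and the queue's last
	 * fence have signaled, without submitting a batch.
	 */
	struct drm_xe_exec exec = {};

	exec.exec_queue_id = q_id;
	exec.num_batch_buffer = 0;
	exec.num_syncs = num_syncs;	/* in-syncs plus one out-sync */
	exec.syncs = (uintptr_t)syncs;	/* struct drm_xe_sync array */

	drmIoctl(fd, DRM_IOCTL_XE_EXEC, &exec);
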
Should we move patches 5/7 and 6/7 up the series so that everything works
as expected when we enable the functionality?

Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-08 12:24     ` Matthew Brost
@ 2023-12-11 15:34       ` Thomas Hellström
  2023-12-11 16:50         ` Matthew Brost
  0 siblings, 1 reply; 22+ messages in thread
From: Thomas Hellström @ 2023-12-11 15:34 UTC (permalink / raw)
  To: Matthew Brost; +Cc: Francois Dugast, intel-xe, Rodrigo Vivi


On 12/8/23 13:24, Matthew Brost wrote:
> On Fri, Dec 08, 2023 at 04:00:37PM +0100, Thomas Hellström wrote:
> Missed a comment, addressing below.
>
>> On 12/7/23 06:57, Matthew Brost wrote:
>>> Remove the concept of async vs sync VM bind queues; instead make async vs
>>> sync a per-IOCTL choice. Since this is per IOCTL, it makes sense to have
>>> a single per-IOCTL flag rather than a per VM bind op flag too. Add
>>> DRM_XE_SYNCS_FLAG_WAIT_FOR_OP which is an input sync flag to support
>>> this. Support this new flag for both the VM bind IOCTL and the exec
>>> IOCTL to match behavior.
>>>
>>> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
>>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>> Cc: Francois Dugast <francois.dugast@intel.com>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++----
>>>    drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
>>>    drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
>>>    drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-------------
>>>    drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
>>>    include/uapi/drm/xe_drm.h                |  56 +++++++-----
>>>    6 files changed, 129 insertions(+), 119 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
>>> index 92b0da6580e8..c62cabfaa112 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec.c
>>> +++ b/drivers/gpu/drm/xe/xe_exec.c
>>> @@ -130,12 +130,15 @@ static int xe_exec_begin(struct drm_exec *exec, struct xe_vm *vm)
>>>    	return err;
>>>    }
>>> +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
>>> +
>>>    int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    {
>>>    	struct xe_device *xe = to_xe_device(dev);
>>>    	struct xe_file *xef = to_xe_file(file);
>>>    	struct drm_xe_exec *args = data;
>>> -	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
>>> +	struct drm_xe_sync __user *syncs_user =
>>> +		u64_to_user_ptr(args->syncs.syncs);
>>>    	u64 __user *addresses_user = u64_to_user_ptr(args->address);
>>>    	struct xe_exec_queue *q;
>>>    	struct xe_sync_entry *syncs = NULL;
>>> @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    	struct drm_exec exec;
>>>    	u32 i, num_syncs = 0;
>>>    	struct xe_sched_job *job;
>>> -	struct dma_fence *rebind_fence;
>>> +	struct dma_fence *rebind_fence, *job_fence;
>>>    	struct xe_vm *vm;
>>> -	bool write_locked;
>>> +	bool write_locked, skip_job_put = false;
>>> +	bool wait = args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
>>>    	ktime_t end = 0;
>>>    	int err = 0;
>>>    	if (XE_IOCTL_DBG(xe, args->extensions) ||
>>> -	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
>>> -	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>>> +	    XE_IOCTL_DBG(xe, args->pad || args->pad2[0] || args->pad2[1] || args->pad2[2]) ||
>>> +	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
>>> +	    XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
>>> +	    XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
>>>    		return -EINVAL;
>>>    	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
>>> @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    		goto err_exec_queue;
>>>    	}
>>> -	if (args->num_syncs) {
>>> -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
>>> +	if (args->syncs.num_syncs) {
>>> +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs),
>>> +				GFP_KERNEL);
>>>    		if (!syncs) {
>>>    			err = -ENOMEM;
>>>    			goto err_exec_queue;
>>> @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    	vm = q->vm;
>>> -	for (i = 0; i < args->num_syncs; i++) {
>>> +	for (i = 0; i < args->syncs.num_syncs; i++) {
>>>    		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
>>>    					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
>>>    					  (xe_vm_in_lr_mode(vm) ?
>>> @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    				err = PTR_ERR(fence);
>>>    				goto err_exec;
>>>    			}
>>> +
>>>    			for (i = 0; i < num_syncs; i++)
>>>    				xe_sync_entry_signal(&syncs[i], NULL, fence);
>>> +
>>>    			xe_exec_queue_last_fence_set(q, vm, fence);
>>> +			if (wait) {
>>> +				long timeout = dma_fence_wait(fence, true);
>>> +
>>> +				if (timeout < 0)
>>> +					err = -EINTR;
>>> +			}
>> Here it looks like we will rerun the same IOCTL again if we return -EINTR.
>> The user-space expected action on -EINTR is to just restart the IOCTL
>> without any argument changes. Solution is to add an ioctl argument cookie
>> (or to skip sync vm binds and have the user just use the 0 batch buffers /
>> vm_binds calls or wait for an out-fence). If you go for the cookie solution
>> then IMO we should keep the -ERESTARTSYS returned from dma_fence_wait()
>> since it's converted to -EINTR on return-to-user-space, and the kernel
>> restarts the IOCTL automatically if there was no requested-for-delivery
>> signal pending.
>>
>> I think the simplest solution at this point is to skip the sync behaviour,
>> in particular if we enable the 0 batch / bind possibility.
>>
>> If we still want to provide it, we could add a cookie address as an
>> extension to the ioctl and activate sync if present? (Just throwing out ideas
>> here).
>>
>>>    			dma_fence_put(fence);
>>>    		}
>>> @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    	 * the job and let the DRM scheduler / backend clean up the job.
>>>    	 */
>>>    	xe_sched_job_arm(job);
>>> +	job_fence = &job->drm.s_fence->finished;
>>> +	if (wait)
>>> +		dma_fence_get(job_fence);
>>>    	if (!xe_vm_in_lr_mode(vm)) {
>>>    		/* Block userptr invalidations / BO eviction */
>>> -		dma_resv_add_fence(&vm->resv,
>>> -				   &job->drm.s_fence->finished,
>>> +		dma_resv_add_fence(&vm->resv, job_fence,
>>>    				   DMA_RESV_USAGE_BOOKKEEP);
>>>    		/*
>>>    		 * Make implicit sync work across drivers, assuming all external
>>>    		 * BOs are written as we don't pass in a read / write list.
>>>    		 */
>>> -		xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
>>> -					DMA_RESV_USAGE_WRITE);
>>> +		xe_vm_fence_all_extobjs(vm, job_fence, DMA_RESV_USAGE_WRITE);
>>>    	}
>>>    	for (i = 0; i < num_syncs; i++)
>>> -		xe_sync_entry_signal(&syncs[i], job,
>>> -				     &job->drm.s_fence->finished);
>>> +		xe_sync_entry_signal(&syncs[i], job, job_fence);
>>>    	if (xe_exec_queue_is_lr(q))
>>>    		q->ring_ops->emit_job(job);
>>>    	if (!xe_vm_in_lr_mode(vm))
>>> -		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
>>> +		xe_exec_queue_last_fence_set(q, vm, job_fence);
>>>    	xe_sched_job_push(job);
>>>    	xe_vm_reactivate_rebind(vm);
>>> -	if (!err && !xe_vm_in_lr_mode(vm)) {
>>> +	if (!xe_vm_in_lr_mode(vm)) {
>>>    		spin_lock(&xe->ttm.lru_lock);
>>>    		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
>>>    		spin_unlock(&xe->ttm.lru_lock);
>>>    	}
>>> +	skip_job_put = true;
>>> +	if (wait) {
>>> +		long timeout = dma_fence_wait(job_fence, true);
>>> +
>>> +		dma_fence_put(job_fence);
>>> +		if (timeout < 0)
>>> +			err = -EINTR;
>>> +	}
>>> +
>>>    err_repin:
>>>    	if (!xe_vm_in_lr_mode(vm))
>>>    		up_read(&vm->userptr.notifier_lock);
>>>    err_put_job:
>>> -	if (err)
>>> +	if (err && !skip_job_put)
>>>    		xe_sched_job_put(job);
>>>    err_exec:
>>>    	drm_exec_fini(&exec);
>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
>>> index 3911d14522ee..98776d02d634 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
>>> @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>>>    	if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
>>>    		return -EINVAL;
>>> -	if (eci[0].engine_class >= DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
>>> -		bool sync = eci[0].engine_class ==
>>> -			DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
>>> -
>>> +	if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
>>>    		for_each_gt(gt, xe, id) {
>>>    			struct xe_exec_queue *new;
>>> @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>>>    						   args->width, hwe,
>>>    						   EXEC_QUEUE_FLAG_PERSISTENT |
>>>    						   EXEC_QUEUE_FLAG_VM |
>>> -						   (sync ? 0 :
>>> -						    EXEC_QUEUE_FLAG_VM_ASYNC) |
>>>    						   (id ?
>>>    						    EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
>>>    						    0));
>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>> index 52f0927d0d9b..c78f6e8b41c4 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>> @@ -74,8 +74,6 @@ struct xe_exec_queue {
>>>    #define EXEC_QUEUE_FLAG_VM			BIT(4)
>>>    /* child of VM queue for multi-tile VM jobs */
>>>    #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD	BIT(5)
>>> -/* VM jobs for this queue are asynchronous */
>>> -#define EXEC_QUEUE_FLAG_VM_ASYNC		BIT(6)
>>>    	/**
>>>    	 * @flags: flags for this exec queue, should statically setup aside from ban
>>> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
>>> index cf2eb44a71db..4b0c976c003a 100644
>>> --- a/drivers/gpu/drm/xe/xe_vm.c
>>> +++ b/drivers/gpu/drm/xe/xe_vm.c
>>> @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
>>>    			struct xe_gt *gt = tile->primary_gt;
>>>    			struct xe_vm *migrate_vm;
>>>    			struct xe_exec_queue *q;
>>> -			u32 create_flags = EXEC_QUEUE_FLAG_VM |
>>> -				((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
>>> -				EXEC_QUEUE_FLAG_VM_ASYNC : 0);
>>> +			u32 create_flags = EXEC_QUEUE_FLAG_VM;
>>>    			if (!vm->pt_root[id])
>>>    				continue;
>>> @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
>>>    	return ERR_PTR(err);
>>>    }
>>> -static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
>>> -{
>>> -	return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
>>> -		!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
>>> -}
>>> -
>>>    static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>>>    			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
>>>    			u32 num_syncs, bool immediate, bool first_op,
>>> -			bool last_op)
>>> +			bool last_op, bool async)
>>>    {
>>>    	struct dma_fence *fence;
>>>    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>>> @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>>>    	if (last_op)
>>>    		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
>>> -	if (last_op && xe_vm_sync_mode(vm, q))
>>> +	if (last_op && !async)
>>>    		dma_fence_wait(fence, true);
>>>    	dma_fence_put(fence);
>>> @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>>>    static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
>>>    		      struct xe_bo *bo, struct xe_sync_entry *syncs,
>>>    		      u32 num_syncs, bool immediate, bool first_op,
>>> -		      bool last_op)
>>> +		      bool last_op, bool async)
>>>    {
>>>    	int err;
>>> @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue
>>>    	}
>>>    	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
>>> -			    last_op);
>>> +			    last_op, async);
>>>    }
>>>    static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>>>    			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
>>> -			u32 num_syncs, bool first_op, bool last_op)
>>> +			u32 num_syncs, bool first_op, bool last_op, bool async)
>>>    {
>>>    	struct dma_fence *fence;
>>>    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>>> @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>>>    	xe_vma_destroy(vma, fence);
>>>    	if (last_op)
>>>    		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
>>> -	if (last_op && xe_vm_sync_mode(vm, q))
>>> +	if (last_op && !async)
>>>    		dma_fence_wait(fence, true);
>> It looks like we're dropping the error return code here.
>>
> I am aware of this. This is fixed in the larger refactor of the VM bind
> error handling [1]. The idea with this series is land the uAPI and get
> the implementation 100% correct in the larger follow up series.
>
> Matt
>
> [1] https://patchwork.freedesktop.org/series/125608/

Then I think we should wait uninterruptibly until that is complete.

/Thomas


>
>>>    	dma_fence_put(fence);
>>> @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>>>    #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
>>>    				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
>>> -				    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
>>>    				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
>>>    int xe_vm_create_ioctl(struct drm_device *dev, void *data,
>>> @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
>>>    		flags |= XE_VM_FLAG_SCRATCH_PAGE;
>>>    	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
>>>    		flags |= XE_VM_FLAG_LR_MODE;
>>> -	if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
>>> -		flags |= XE_VM_FLAG_ASYNC_DEFAULT;
>>>    	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
>>>    		flags |= XE_VM_FLAG_FAULT_MODE;
>>> @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] = {
>>>    static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
>>>    			  struct xe_exec_queue *q, u32 region,
>>>    			  struct xe_sync_entry *syncs, u32 num_syncs,
>>> -			  bool first_op, bool last_op)
>>> +			  bool first_op, bool last_op, bool async)
>>>    {
>>>    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>>>    	int err;
>>> @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
>>>    	if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
>>>    		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
>>> -				  true, first_op, last_op);
>>> +				  true, first_op, last_op, async);
>>>    	} else {
>>>    		int i;
>>> @@ -2400,6 +2389,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
>>>    		}
>>>    		op->q = q;
>>> +		if (async)
>>> +			op->flags |= XE_VMA_OP_ASYNC;
>>>    		switch (op->base.op) {
>>>    		case DRM_GPUVA_OP_MAP:
>>> @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>    				 op->syncs, op->num_syncs,
>>>    				 op->map.immediate || !xe_vm_in_fault_mode(vm),
>>>    				 op->flags & XE_VMA_OP_FIRST,
>>> -				 op->flags & XE_VMA_OP_LAST);
>>> +				 op->flags & XE_VMA_OP_LAST,
>>> +				 op->flags & XE_VMA_OP_ASYNC);
>>>    		break;
>>>    	case DRM_GPUVA_OP_REMAP:
>>>    	{
>>> @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>    					   op->num_syncs,
>>>    					   op->flags & XE_VMA_OP_FIRST,
>>>    					   op->flags & XE_VMA_OP_LAST &&
>>> -					   !prev && !next);
>>> +					   !prev && !next,
>>> +					   op->flags & XE_VMA_OP_ASYNC);
>>>    			if (err)
>>>    				break;
>>>    			op->remap.unmap_done = true;
>>> @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>    			err = xe_vm_bind(vm, op->remap.prev, op->q,
>>>    					 xe_vma_bo(op->remap.prev), op->syncs,
>>>    					 op->num_syncs, true, false,
>>> -					 op->flags & XE_VMA_OP_LAST && !next);
>>> +					 op->flags & XE_VMA_OP_LAST && !next,
>>> +					 op->flags & XE_VMA_OP_ASYNC);
>>>    			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
>>>    			if (err)
>>>    				break;
>>> @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>    					 xe_vma_bo(op->remap.next),
>>>    					 op->syncs, op->num_syncs,
>>>    					 true, false,
>>> -					 op->flags & XE_VMA_OP_LAST);
>>> +					 op->flags & XE_VMA_OP_LAST,
>>> +					 op->flags & XE_VMA_OP_ASYNC);
>>>    			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
>>>    			if (err)
>>>    				break;
>>> @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>    	case DRM_GPUVA_OP_UNMAP:
>>>    		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
>>>    				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
>>> -				   op->flags & XE_VMA_OP_LAST);
>>> +				   op->flags & XE_VMA_OP_LAST,
>>> +				   op->flags & XE_VMA_OP_ASYNC);
>>>    		break;
>>>    	case DRM_GPUVA_OP_PREFETCH:
>>>    		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
>>>    				     op->syncs, op->num_syncs,
>>>    				     op->flags & XE_VMA_OP_FIRST,
>>> -				     op->flags & XE_VMA_OP_LAST);
>>> +				     op->flags & XE_VMA_OP_LAST,
>>> +				     op->flags & XE_VMA_OP_ASYNC);
>>>    		break;
>>>    	default:
>>>    		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
>>> @@ -2808,16 +2805,16 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
>>>    #ifdef TEST_VM_ASYNC_OPS_ERROR
>>>    #define SUPPORTED_FLAGS	\
>>> -	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
>>> -	 DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
>>> -	 DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
>>> +	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
>>> +	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
>>>    #else
>>>    #define SUPPORTED_FLAGS	\
>>> -	(DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
>>> +	(DRM_XE_VM_BIND_FLAG_READONLY | \
>>>    	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
>>>    	 0xffff)
>>>    #endif
>>>    #define XE_64K_PAGE_MASK 0xffffull
>>> +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
>>>    #define MAX_BINDS	512	/* FIXME: Picking random upper limit */
>>> @@ -2829,7 +2826,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>    	int err;
>>>    	int i;
>>> -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
>>> +	if (XE_IOCTL_DBG(xe, args->pad) ||
>>>    	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>>>    		return -EINVAL;
>>> @@ -2857,6 +2854,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>    		*bind_ops = &args->bind;
>>>    	}
>>> +	*async = !(args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
>>> +
>>> +	if (XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
>>> +	    XE_IOCTL_DBG(xe, !*async && args->syncs.num_syncs)) {
>>> +		err = -EINVAL;
>>> +		goto free_bind_ops;
>>> +	}
>>> +
>>>    	for (i = 0; i < args->num_binds; ++i) {
>>>    		u64 range = (*bind_ops)[i].range;
>>>    		u64 addr = (*bind_ops)[i].addr;
>>> @@ -2887,18 +2892,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>    			goto free_bind_ops;
>>>    		}
>>> -		if (i == 0) {
>>> -			*async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
>>> -			if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
>>> -				err = -EINVAL;
>>> -				goto free_bind_ops;
>>> -			}
>>> -		} else if (XE_IOCTL_DBG(xe, *async !=
>>> -					!!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
>>> -			err = -EINVAL;
>>> -			goto free_bind_ops;
>>> -		}
>>> -
>>>    		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
>>>    		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
>>>    		    XE_IOCTL_DBG(xe, obj && is_null) ||
>>> @@ -2951,7 +2944,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>    static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
>>>    				       struct xe_exec_queue *q,
>>>    				       struct xe_sync_entry *syncs,
>>> -				       int num_syncs)
>>> +				       int num_syncs, bool async)
>>>    {
>>>    	struct dma_fence *fence;
>>>    	int i, err = 0;
>>> @@ -2967,7 +2960,7 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
>>>    	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
>>>    				     fence);
>>> -	if (xe_vm_sync_mode(vm, q)) {
>>> +	if (!async) {
>>>    		long timeout = dma_fence_wait(fence, true);
>>>    		if (timeout < 0)
>>> @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    	if (err)
>>>    		return err;
>>> -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
>>> +	if (XE_IOCTL_DBG(xe, args->pad) ||
>>>    	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>>>    		return -EINVAL;
>>> @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    			err = -EINVAL;
>>>    			goto put_exec_queue;
>>>    		}
>>> -
>>> -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
>>> -				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
>>> -			err = -EINVAL;
>>> -			goto put_exec_queue;
>>> -		}
>>>    	}
>>>    	vm = xe_vm_lookup(xef, args->vm_id);
>>> @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    		goto put_exec_queue;
>>>    	}
>>> -	if (!args->exec_queue_id) {
>>> -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
>>> -				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
>>> -			err = -EINVAL;
>>> -			goto put_vm;
>>> -		}
>>> -	}
>>> -
>>>    	err = down_write_killable(&vm->lock);
>>>    	if (err)
>>>    		goto put_vm;
>>> @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    		}
>>>    	}
>>> -	if (args->num_syncs) {
>>> -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
>>> +	if (args->syncs.num_syncs) {
>>> +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs), GFP_KERNEL);
>>>    		if (!syncs) {
>>>    			err = -ENOMEM;
>>>    			goto put_obj;
>>>    		}
>>>    	}
>>> -	syncs_user = u64_to_user_ptr(args->syncs);
>>> -	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
>>> +	syncs_user = u64_to_user_ptr(args->syncs.syncs);
>>> +	for (num_syncs = 0; num_syncs < args->syncs.num_syncs; num_syncs++) {
>>>    		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
>>>    					  &syncs_user[num_syncs],
>>>    					  (xe_vm_in_lr_mode(vm) ?
>>> @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
>>>    free_syncs:
>>>    	if (err == -ENODATA)
>>> -		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
>>> +		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs,
>>> +						  async);
>>>    	while (num_syncs--)
>>>    		xe_sync_entry_cleanup(&syncs[num_syncs]);
>>> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
>>> index 23abdfd8622f..ce8b9bde7e9c 100644
>>> --- a/drivers/gpu/drm/xe/xe_vm_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
>>> @@ -167,13 +167,12 @@ struct xe_vm {
>>>    	 */
>>>    #define XE_VM_FLAG_64K			BIT(0)
>>>    #define XE_VM_FLAG_LR_MODE		BIT(1)
>>> -#define XE_VM_FLAG_ASYNC_DEFAULT	BIT(2)
>>> -#define XE_VM_FLAG_MIGRATION		BIT(3)
>>> -#define XE_VM_FLAG_SCRATCH_PAGE		BIT(4)
>>> -#define XE_VM_FLAG_FAULT_MODE		BIT(5)
>>> -#define XE_VM_FLAG_BANNED		BIT(6)
>>> -#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(8, 7), flags)
>>> -#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(8, 7), (tile)->id)
>>> +#define XE_VM_FLAG_MIGRATION		BIT(2)
>>> +#define XE_VM_FLAG_SCRATCH_PAGE		BIT(3)
>>> +#define XE_VM_FLAG_FAULT_MODE		BIT(4)
>>> +#define XE_VM_FLAG_BANNED		BIT(5)
>>> +#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(7, 6), flags)
>>> +#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(7, 6), (tile)->id)
>>>    	unsigned long flags;
>>>    	/** @composite_fence_ctx: context composite fence */
>>> @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
>>>    	XE_VMA_OP_PREV_COMMITTED	= BIT(3),
>>>    	/** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */
>>>    	XE_VMA_OP_NEXT_COMMITTED	= BIT(4),
>>> +	/** @XE_VMA_OP_ASYNC: operation is async */
>>> +	XE_VMA_OP_ASYNC			= BIT(5),
>>>    };
>>>    /** struct xe_vma_op - VMA operation */
>>> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
>>> index eb03a49c17a1..fd8172fe2d9a 100644
>>> --- a/include/uapi/drm/xe_drm.h
>>> +++ b/include/uapi/drm/xe_drm.h
>>> @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
>>>    	 * Kernel only classes (not actual hardware engine class). Used for
>>>    	 * creating ordered queues of VM bind operations.
>>>    	 */
>>> -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC	5
>>> -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC	6
>>> +#define DRM_XE_ENGINE_CLASS_VM_BIND		5
>>>    	__u16 engine_class;
>>>    	__u16 engine_instance;
>>> @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
>>>    	 * still enable recoverable pagefaults if supported by the device.
>>>    	 */
>>>    #define DRM_XE_VM_CREATE_FLAG_LR_MODE	        (1 << 1)
>>> -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT	(1 << 2)
>>>    	/*
>>>    	 * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
>>>    	 * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to be allocated
>>> @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
>>>    	 * The xe driver internally uses recoverable pagefaults to implement
>>>    	 * this.
>>>    	 */
>>> -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 3)
>>> +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 2)
>>>    	/** @flags: Flags */
>>>    	__u32 flags;
>>> @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
>>>    	__u32 op;
>>>    #define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
>>> -#define DRM_XE_VM_BIND_FLAG_ASYNC	(1 << 1)
>>>    	/*
>>>    	 * Valid on a faulting VM only, do the MAP operation immediately rather
>>>    	 * than deferring the MAP to the page fault handler.
>>>    	 */
>>> -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 2)
>>> +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
>>>    	/*
>>>    	 * When the NULL flag is set, the page tables are setup with a special
>>>    	 * bit which indicates writes are dropped and all reads return zero.  In
>>> @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
>>>    	 * operations, the BO handle MBZ, and the BO offset MBZ. This flag is
>>>    	 * intended to implement VK sparse bindings.
>>>    	 */
>>> -#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 3)
>>> +#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
>>>    	/** @flags: Bind flags */
>>>    	__u32 flags;
>>> @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
>>>    	__u64 reserved[3];
>>>    };
>>> +/**
>>> + * struct drm_xe_syncs - In / out syncs for IOCTLs.
>>> + */
>>> +struct drm_xe_syncs {
>>> +	/** @num_syncs: amount of syncs to wait on */
>>> +	__u32 num_syncs;
>>> +
>>> +	/*
>>> +	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
>>> +	 */
>>> +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
>>> +	/** @flags: Sync flags */
>>> +	__u32 flags;
>>> +
>>> +	/** @syncs: pointer to struct drm_xe_sync array */
>>> +	__u64 syncs;
>>> +
>>> +	/** @reserved: Reserved */
>>> +	__u64 reserved[2];
>>> +};
>>> +
>>>    struct drm_xe_vm_bind {
>>>    	/** @extensions: Pointer to the first extension struct, if any */
>>>    	__u64 extensions;
>>> @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
>>>    		__u64 vector_of_binds;
>>>    	};
>>> -	/** @pad: MBZ */
>>> -	__u32 pad2;
>>> -
>>> -	/** @num_syncs: amount of syncs to wait on */
>>> -	__u32 num_syncs;
>>> -
>>> -	/** @syncs: pointer to struct drm_xe_sync array */
>>> -	__u64 syncs;
>>> +	/** @syncs: syncs for bind */
>>> +	struct drm_xe_syncs syncs;
>>>    	/** @reserved: Reserved */
>>>    	__u64 reserved[2];
>>> @@ -974,14 +986,14 @@ struct drm_xe_exec {
>>>    	/** @extensions: Pointer to the first extension struct, if any */
>>>    	__u64 extensions;
>>> +	/** @pad: MBZ */
>>> +	__u32 pad;
>>> +
>>>    	/** @exec_queue_id: Exec queue ID for the batch buffer */
>>>    	__u32 exec_queue_id;
>>> -	/** @num_syncs: Amount of struct drm_xe_sync in array. */
>>> -	__u32 num_syncs;
>>> -
>>> -	/** @syncs: Pointer to struct drm_xe_sync array. */
>>> -	__u64 syncs;
>>> +	/** @syncs: syncs for exec */
>>> +	struct drm_xe_syncs syncs;
>>>    	/**
>>>    	 * @address: address of batch buffer if num_batch_buffer == 1 or an
>>> @@ -995,8 +1007,8 @@ struct drm_xe_exec {
>>>    	 */
>>>    	__u16 num_batch_buffer;
>>> -	/** @pad: MBZ */
>>> -	__u16 pad[3];
>>> +	/** @pad2: MBZ */
>>> +	__u16 pad2[3];
>>>    	/** @reserved: Reserved */
>>>    	__u64 reserved[2];

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-08  9:45     ` Matthew Brost
@ 2023-12-11 15:43       ` Thomas Hellström
  2023-12-11 16:49         ` Matthew Brost
  0 siblings, 1 reply; 22+ messages in thread
From: Thomas Hellström @ 2023-12-11 15:43 UTC (permalink / raw)
  To: Matthew Brost; +Cc: Francois Dugast, intel-xe, Rodrigo Vivi


On 12/8/23 10:45, Matthew Brost wrote:
> On Fri, Dec 08, 2023 at 04:00:37PM +0100, Thomas Hellström wrote:
>> On 12/7/23 06:57, Matthew Brost wrote:
>>> Remove the concept of async vs sync VM bind queues; instead make async vs
>>> sync a per-IOCTL choice. Since this is per IOCTL, it makes sense to have
>>> a single per-IOCTL flag rather than a per VM bind op flag too. Add
>>> DRM_XE_SYNCS_FLAG_WAIT_FOR_OP which is an input sync flag to support
>>> this. Support this new flag for both the VM bind IOCTL and the exec
>>> IOCTL to match behavior.
>>>
>>> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
>>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>> Cc: Francois Dugast <francois.dugast@intel.com>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++----
>>>    drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
>>>    drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
>>>    drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-------------
>>>    drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
>>>    include/uapi/drm/xe_drm.h                |  56 +++++++-----
>>>    6 files changed, 129 insertions(+), 119 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
>>> index 92b0da6580e8..c62cabfaa112 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec.c
>>> +++ b/drivers/gpu/drm/xe/xe_exec.c
>>> @@ -130,12 +130,15 @@ static int xe_exec_begin(struct drm_exec *exec, struct xe_vm *vm)
>>>    	return err;
>>>    }
>>> +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
>>> +
>>>    int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    {
>>>    	struct xe_device *xe = to_xe_device(dev);
>>>    	struct xe_file *xef = to_xe_file(file);
>>>    	struct drm_xe_exec *args = data;
>>> -	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
>>> +	struct drm_xe_sync __user *syncs_user =
>>> +		u64_to_user_ptr(args->syncs.syncs);
>>>    	u64 __user *addresses_user = u64_to_user_ptr(args->address);
>>>    	struct xe_exec_queue *q;
>>>    	struct xe_sync_entry *syncs = NULL;
>>> @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    	struct drm_exec exec;
>>>    	u32 i, num_syncs = 0;
>>>    	struct xe_sched_job *job;
>>> -	struct dma_fence *rebind_fence;
>>> +	struct dma_fence *rebind_fence, *job_fence;
>>>    	struct xe_vm *vm;
>>> -	bool write_locked;
>>> +	bool write_locked, skip_job_put = false;
>>> +	bool wait = args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
>>>    	ktime_t end = 0;
>>>    	int err = 0;
>>>    	if (XE_IOCTL_DBG(xe, args->extensions) ||
>>> -	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
>>> -	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>>> +	    XE_IOCTL_DBG(xe, args->pad || args->pad2[0] || args->pad2[1] || args->pad2[2]) ||
>>> +	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
>>> +	    XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
>>> +	    XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
>>>    		return -EINVAL;
>>>    	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
>>> @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    		goto err_exec_queue;
>>>    	}
>>> -	if (args->num_syncs) {
>>> -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
>>> +	if (args->syncs.num_syncs) {
>>> +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs),
>>> +				GFP_KERNEL);
>>>    		if (!syncs) {
>>>    			err = -ENOMEM;
>>>    			goto err_exec_queue;
>>> @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    	vm = q->vm;
>>> -	for (i = 0; i < args->num_syncs; i++) {
>>> +	for (i = 0; i < args->syncs.num_syncs; i++) {
>>>    		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
>>>    					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
>>>    					  (xe_vm_in_lr_mode(vm) ?
>>> @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    				err = PTR_ERR(fence);
>>>    				goto err_exec;
>>>    			}
>>> +
>>>    			for (i = 0; i < num_syncs; i++)
>>>    				xe_sync_entry_signal(&syncs[i], NULL, fence);
>>> +
>>>    			xe_exec_queue_last_fence_set(q, vm, fence);
>>> +			if (wait) {
>>> +				long timeout = dma_fence_wait(fence, true);
>>> +
>>> +				if (timeout < 0)
>>> +					err = -EINTR;
>>> +			}
>> Here it looks like we will rerun the same IOCTL again if we return -EINTR.
>> The user-space expected action on -EINTR is to just restart the IOCTL
>> without any argument changes. Solution is to add an ioctl argument cookie
>> (or to skip sync vm binds and have the user just use the 0 batch buffers /
>> vm_binds calls or wait for an out-fence). If you go for the cookie solution
>> then IMO we should keep the -ERESTARTSYS returned from dma_fence_wait()
>> since it's converted to -EINTR on return-to-user-space, and the kernel
>> restarts the IOCTL automatically if there was no requested-for-delivery
>> signal pending.
>>
>> I think the simplest solution at this point is to skip the sync behaviour,
>> in particular if we enable the 0 batch / bind possibility.
>>
>> If we still want to provide it, we could add a cookie address as an
>> extension to the ioctl and activate sync if present? (Just throwing out ideas
>> here).
>>
> Hmm, forgot about this. A cookie is fairly easy, what about something like this:
>
>   807 /**
>   808  * struct drm_xe_syncs - In / out syncs for IOCTLs.
>   809  */
>   810 struct drm_xe_syncs {
>   811         /** @num_syncs: amount of syncs to wait on */
>   812         __u32 num_syncs;
>   813
>   814         /*
>   815          * Block in IOCTL until operation complete, num_syncs MBZ if set.
>   816          */
>   817 #define DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP (1 << 0)
>   818         /** @in_flags: Input Sync flags */
>   819         __u16 in_flags;
>   820
>   821         /*
>   822          * IOCTL operation has started (no need for user to resubmit on
>   823          * -ERESTARTSYS)
>   824          */
>   825 #define DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED (1 << 0)
>   826         /** @out_flags: Output Sync flags */
>   827         __u16 out_flags;
>   828
>   829         /** @syncs: pointer to struct drm_xe_sync array */
>   830         __u64 syncs;
>   831
>   832         /** @reserved: Reserved */
>   833         __u64 reserved[2];
>   834 };
>
> DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED gets set in the exec / bind IOCTL after
> the job is committed or, in the case of zero ops, after the last fence is
> updated on the queue. Note that for binds we don't yet do 1 job per IOCTL,
> but will after landing some version of [1].
>
> After DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED is set we return -ERESTARTSYS if
> the wait is interrupted, and -EINTR if DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED
> is still unset (interrupted before the job is committed).
>
> I'd rather go with the patch, as we have to change the uAPI here regardless,
> so we might as well make this complete.
>
> Matt
>
> [1] https://patchwork.freedesktop.org/series/125608/

Yeah, as we discussed in the meeting, that means making the ioctl RW
instead of W, with some copying overhead.

I also think we should leave the EXEC ioctl out of this, meaning just 
having a single field in the VM_BIND ioctl. Basically the reason is that 
waiting like this after submission is a bit weird and does not align 
well with how -EINTR is typically used.
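
(The canonical userspace pattern is simply

	do {
		ret = ioctl(fd, DRM_IOCTL_XE_VM_BIND, &args);
	} while (ret == -1 && errno == EINTR);

with no argument changes in between; an interruptible wait after
submission breaks that assumption unless we add a cookie.)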

So either a pointer to a cookie in the ioctl,

or perhaps dig up again the idea we had of mostly waiting before the 
submission:

1) Pull out the last_op fence for the queue from under the relevant lock.
2) Wait for all dependencies without any locks.
3) Lock, and (optionally) if the last_op fence changed, wait for it.
4) Submit
5) Wait for completion uninterruptible.

I actually like this last one best, but we'd recommend UMDs use
out-fences whenever possible.
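
Very roughly, 1-5 could look like this (sketch only, hypothetical helper
names, reference counting and unwind omitted):

	static int bind_sync_wait(struct xe_vm *vm, struct xe_exec_queue *q,
				  struct bind_args *args) /* args type hypothetical */
	{
		struct dma_fence *last, *fence;
		long err;

		down_write(&vm->lock);
		last = xe_exec_queue_last_fence_get(q, vm);	/* 1 */
		up_write(&vm->lock);

		err = dma_fence_wait(last, true);		/* 2: no locks held */
		if (err)
			return err;	/* nothing submitted, plain restart ok */

		down_write(&vm->lock);
		if (last != xe_exec_queue_last_fence_get(q, vm))
			err = dma_fence_wait(xe_exec_queue_last_fence_get(q, vm),
					     true);		/* 3: last_op moved */
		if (!err)
			fence = submit_binds(vm, q, args);	/* 4: hypothetical */
		up_write(&vm->lock);

		if (!err)
			dma_fence_wait(fence, false);		/* 5: uninterruptible */
		return err;
	}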

Thoughts?

>
>>>    			dma_fence_put(fence);
>>>    		}
>>> @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    	 * the job and let the DRM scheduler / backend clean up the job.
>>>    	 */
>>>    	xe_sched_job_arm(job);
>>> +	job_fence = &job->drm.s_fence->finished;
>>> +	if (wait)
>>> +		dma_fence_get(job_fence);
>>>    	if (!xe_vm_in_lr_mode(vm)) {
>>>    		/* Block userptr invalidations / BO eviction */
>>> -		dma_resv_add_fence(&vm->resv,
>>> -				   &job->drm.s_fence->finished,
>>> +		dma_resv_add_fence(&vm->resv, job_fence,
>>>    				   DMA_RESV_USAGE_BOOKKEEP);
>>>    		/*
>>>    		 * Make implicit sync work across drivers, assuming all external
>>>    		 * BOs are written as we don't pass in a read / write list.
>>>    		 */
>>> -		xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
>>> -					DMA_RESV_USAGE_WRITE);
>>> +		xe_vm_fence_all_extobjs(vm, job_fence, DMA_RESV_USAGE_WRITE);
>>>    	}
>>>    	for (i = 0; i < num_syncs; i++)
>>> -		xe_sync_entry_signal(&syncs[i], job,
>>> -				     &job->drm.s_fence->finished);
>>> +		xe_sync_entry_signal(&syncs[i], job, job_fence);
>>>    	if (xe_exec_queue_is_lr(q))
>>>    		q->ring_ops->emit_job(job);
>>>    	if (!xe_vm_in_lr_mode(vm))
>>> -		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
>>> +		xe_exec_queue_last_fence_set(q, vm, job_fence);
>>>    	xe_sched_job_push(job);
>>>    	xe_vm_reactivate_rebind(vm);
>>> -	if (!err && !xe_vm_in_lr_mode(vm)) {
>>> +	if (!xe_vm_in_lr_mode(vm)) {
>>>    		spin_lock(&xe->ttm.lru_lock);
>>>    		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
>>>    		spin_unlock(&xe->ttm.lru_lock);
>>>    	}
>>> +	skip_job_put = true;
>>> +	if (wait) {
>>> +		long timeout = dma_fence_wait(job_fence, true);
>>> +
>>> +		dma_fence_put(job_fence);
>>> +		if (timeout < 0)
>>> +			err = -EINTR;
>>> +	}
>>> +
>>>    err_repin:
>>>    	if (!xe_vm_in_lr_mode(vm))
>>>    		up_read(&vm->userptr.notifier_lock);
>>>    err_put_job:
>>> -	if (err)
>>> +	if (err && !skip_job_put)
>>>    		xe_sched_job_put(job);
>>>    err_exec:
>>>    	drm_exec_fini(&exec);
>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
>>> index 3911d14522ee..98776d02d634 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
>>> @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>>>    	if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
>>>    		return -EINVAL;
>>> -	if (eci[0].engine_class >= DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
>>> -		bool sync = eci[0].engine_class ==
>>> -			DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
>>> -
>>> +	if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
>>>    		for_each_gt(gt, xe, id) {
>>>    			struct xe_exec_queue *new;
>>> @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>>>    						   args->width, hwe,
>>>    						   EXEC_QUEUE_FLAG_PERSISTENT |
>>>    						   EXEC_QUEUE_FLAG_VM |
>>> -						   (sync ? 0 :
>>> -						    EXEC_QUEUE_FLAG_VM_ASYNC) |
>>>    						   (id ?
>>>    						    EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
>>>    						    0));
>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>> index 52f0927d0d9b..c78f6e8b41c4 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>> @@ -74,8 +74,6 @@ struct xe_exec_queue {
>>>    #define EXEC_QUEUE_FLAG_VM			BIT(4)
>>>    /* child of VM queue for multi-tile VM jobs */
>>>    #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD	BIT(5)
>>> -/* VM jobs for this queue are asynchronous */
>>> -#define EXEC_QUEUE_FLAG_VM_ASYNC		BIT(6)
>>>    	/**
>>>    	 * @flags: flags for this exec queue, should statically setup aside from ban
>>> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
>>> index cf2eb44a71db..4b0c976c003a 100644
>>> --- a/drivers/gpu/drm/xe/xe_vm.c
>>> +++ b/drivers/gpu/drm/xe/xe_vm.c
>>> @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
>>>    			struct xe_gt *gt = tile->primary_gt;
>>>    			struct xe_vm *migrate_vm;
>>>    			struct xe_exec_queue *q;
>>> -			u32 create_flags = EXEC_QUEUE_FLAG_VM |
>>> -				((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
>>> -				EXEC_QUEUE_FLAG_VM_ASYNC : 0);
>>> +			u32 create_flags = EXEC_QUEUE_FLAG_VM;
>>>    			if (!vm->pt_root[id])
>>>    				continue;
>>> @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
>>>    	return ERR_PTR(err);
>>>    }
>>> -static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
>>> -{
>>> -	return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
>>> -		!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
>>> -}
>>> -
>>>    static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>>>    			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
>>>    			u32 num_syncs, bool immediate, bool first_op,
>>> -			bool last_op)
>>> +			bool last_op, bool async)
>>>    {
>>>    	struct dma_fence *fence;
>>>    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>>> @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>>>    	if (last_op)
>>>    		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
>>> -	if (last_op && xe_vm_sync_mode(vm, q))
>>> +	if (last_op && !async)
>>>    		dma_fence_wait(fence, true);
>>>    	dma_fence_put(fence);
>>> @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>>>    static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
>>>    		      struct xe_bo *bo, struct xe_sync_entry *syncs,
>>>    		      u32 num_syncs, bool immediate, bool first_op,
>>> -		      bool last_op)
>>> +		      bool last_op, bool async)
>>>    {
>>>    	int err;
>>> @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue
>>>    	}
>>>    	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
>>> -			    last_op);
>>> +			    last_op, async);
>>>    }
>>>    static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>>>    			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
>>> -			u32 num_syncs, bool first_op, bool last_op)
>>> +			u32 num_syncs, bool first_op, bool last_op, bool async)
>>>    {
>>>    	struct dma_fence *fence;
>>>    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>>> @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>>>    	xe_vma_destroy(vma, fence);
>>>    	if (last_op)
>>>    		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
>>> -	if (last_op && xe_vm_sync_mode(vm, q))
>>> +	if (last_op && !async)
>>>    		dma_fence_wait(fence, true);
>> It looks like we're dropping the error return code here.
>>
>>
>>>    	dma_fence_put(fence);
>>> @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>>>    #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
>>>    				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
>>> -				    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
>>>    				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
>>>    int xe_vm_create_ioctl(struct drm_device *dev, void *data,
>>> @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
>>>    		flags |= XE_VM_FLAG_SCRATCH_PAGE;
>>>    	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
>>>    		flags |= XE_VM_FLAG_LR_MODE;
>>> -	if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
>>> -		flags |= XE_VM_FLAG_ASYNC_DEFAULT;
>>>    	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
>>>    		flags |= XE_VM_FLAG_FAULT_MODE;
>>> @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] = {
>>>    static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
>>>    			  struct xe_exec_queue *q, u32 region,
>>>    			  struct xe_sync_entry *syncs, u32 num_syncs,
>>> -			  bool first_op, bool last_op)
>>> +			  bool first_op, bool last_op, bool async)
>>>    {
>>>    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>>>    	int err;
>>> @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
>>>    	if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
>>>    		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
>>> -				  true, first_op, last_op);
>>> +				  true, first_op, last_op, async);
>>>    	} else {
>>>    		int i;
>>> @@ -2400,6 +2389,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
>>>    		}
>>>    		op->q = q;
>>> +		if (async)
>>> +			op->flags |= XE_VMA_OP_ASYNC;
>>>    		switch (op->base.op) {
>>>    		case DRM_GPUVA_OP_MAP:
>>> @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>    				 op->syncs, op->num_syncs,
>>>    				 op->map.immediate || !xe_vm_in_fault_mode(vm),
>>>    				 op->flags & XE_VMA_OP_FIRST,
>>> -				 op->flags & XE_VMA_OP_LAST);
>>> +				 op->flags & XE_VMA_OP_LAST,
>>> +				 op->flags & XE_VMA_OP_ASYNC);
>>>    		break;
>>>    	case DRM_GPUVA_OP_REMAP:
>>>    	{
>>> @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>    					   op->num_syncs,
>>>    					   op->flags & XE_VMA_OP_FIRST,
>>>    					   op->flags & XE_VMA_OP_LAST &&
>>> -					   !prev && !next);
>>> +					   !prev && !next,
>>> +					   op->flags & XE_VMA_OP_ASYNC);
>>>    			if (err)
>>>    				break;
>>>    			op->remap.unmap_done = true;
>>> @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>    			err = xe_vm_bind(vm, op->remap.prev, op->q,
>>>    					 xe_vma_bo(op->remap.prev), op->syncs,
>>>    					 op->num_syncs, true, false,
>>> -					 op->flags & XE_VMA_OP_LAST && !next);
>>> +					 op->flags & XE_VMA_OP_LAST && !next,
>>> +					 op->flags & XE_VMA_OP_ASYNC);
>>>    			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
>>>    			if (err)
>>>    				break;
>>> @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>    					 xe_vma_bo(op->remap.next),
>>>    					 op->syncs, op->num_syncs,
>>>    					 true, false,
>>> -					 op->flags & XE_VMA_OP_LAST);
>>> +					 op->flags & XE_VMA_OP_LAST,
>>> +					 op->flags & XE_VMA_OP_ASYNC);
>>>    			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
>>>    			if (err)
>>>    				break;
>>> @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>    	case DRM_GPUVA_OP_UNMAP:
>>>    		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
>>>    				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
>>> -				   op->flags & XE_VMA_OP_LAST);
>>> +				   op->flags & XE_VMA_OP_LAST,
>>> +				   op->flags & XE_VMA_OP_ASYNC);
>>>    		break;
>>>    	case DRM_GPUVA_OP_PREFETCH:
>>>    		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
>>>    				     op->syncs, op->num_syncs,
>>>    				     op->flags & XE_VMA_OP_FIRST,
>>> -				     op->flags & XE_VMA_OP_LAST);
>>> +				     op->flags & XE_VMA_OP_LAST,
>>> +				     op->flags & XE_VMA_OP_ASYNC);
>>>    		break;
>>>    	default:
>>>    		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
>>> @@ -2808,16 +2805,16 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
>>>    #ifdef TEST_VM_ASYNC_OPS_ERROR
>>>    #define SUPPORTED_FLAGS	\
>>> -	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
>>> -	 DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
>>> -	 DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
>>> +	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
>>> +	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
>>>    #else
>>>    #define SUPPORTED_FLAGS	\
>>> -	(DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
>>> +	(DRM_XE_VM_BIND_FLAG_READONLY | \
>>>    	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
>>>    	 0xffff)
>>>    #endif
>>>    #define XE_64K_PAGE_MASK 0xffffull
>>> +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
>>>    #define MAX_BINDS	512	/* FIXME: Picking random upper limit */
>>> @@ -2829,7 +2826,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>    	int err;
>>>    	int i;
>>> -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
>>> +	if (XE_IOCTL_DBG(xe, args->pad) ||
>>>    	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>>>    		return -EINVAL;
>>> @@ -2857,6 +2854,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>    		*bind_ops = &args->bind;
>>>    	}
>>> +	*async = !(args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
>>> +
>>> +	if (XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
>>> +	    XE_IOCTL_DBG(xe, !*async && args->syncs.num_syncs)) {
>>> +		err = -EINVAL;
>>> +		goto free_bind_ops;
>>> +	}
>>> +
>>>    	for (i = 0; i < args->num_binds; ++i) {
>>>    		u64 range = (*bind_ops)[i].range;
>>>    		u64 addr = (*bind_ops)[i].addr;
>>> @@ -2887,18 +2892,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>    			goto free_bind_ops;
>>>    		}
>>> -		if (i == 0) {
>>> -			*async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
>>> -			if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
>>> -				err = -EINVAL;
>>> -				goto free_bind_ops;
>>> -			}
>>> -		} else if (XE_IOCTL_DBG(xe, *async !=
>>> -					!!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
>>> -			err = -EINVAL;
>>> -			goto free_bind_ops;
>>> -		}
>>> -
>>>    		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
>>>    		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
>>>    		    XE_IOCTL_DBG(xe, obj && is_null) ||
>>> @@ -2951,7 +2944,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>    static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
>>>    				       struct xe_exec_queue *q,
>>>    				       struct xe_sync_entry *syncs,
>>> -				       int num_syncs)
>>> +				       int num_syncs, bool async)
>>>    {
>>>    	struct dma_fence *fence;
>>>    	int i, err = 0;
>>> @@ -2967,7 +2960,7 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
>>>    	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
>>>    				     fence);
>>> -	if (xe_vm_sync_mode(vm, q)) {
>>> +	if (!async) {
>>>    		long timeout = dma_fence_wait(fence, true);
>>>    		if (timeout < 0)
>>> @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    	if (err)
>>>    		return err;
>>> -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
>>> +	if (XE_IOCTL_DBG(xe, args->pad) ||
>>>    	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>>>    		return -EINVAL;
>>> @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    			err = -EINVAL;
>>>    			goto put_exec_queue;
>>>    		}
>>> -
>>> -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
>>> -				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
>>> -			err = -EINVAL;
>>> -			goto put_exec_queue;
>>> -		}
>>>    	}
>>>    	vm = xe_vm_lookup(xef, args->vm_id);
>>> @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    		goto put_exec_queue;
>>>    	}
>>> -	if (!args->exec_queue_id) {
>>> -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
>>> -				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
>>> -			err = -EINVAL;
>>> -			goto put_vm;
>>> -		}
>>> -	}
>>> -
>>>    	err = down_write_killable(&vm->lock);
>>>    	if (err)
>>>    		goto put_vm;
>>> @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    		}
>>>    	}
>>> -	if (args->num_syncs) {
>>> -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
>>> +	if (args->syncs.num_syncs) {
>>> +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs), GFP_KERNEL);
>>>    		if (!syncs) {
>>>    			err = -ENOMEM;
>>>    			goto put_obj;
>>>    		}
>>>    	}
>>> -	syncs_user = u64_to_user_ptr(args->syncs);
>>> -	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
>>> +	syncs_user = u64_to_user_ptr(args->syncs.syncs);
>>> +	for (num_syncs = 0; num_syncs < args->syncs.num_syncs; num_syncs++) {
>>>    		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
>>>    					  &syncs_user[num_syncs],
>>>    					  (xe_vm_in_lr_mode(vm) ?
>>> @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>    	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
>>>    free_syncs:
>>>    	if (err == -ENODATA)
>>> -		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
>>> +		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs,
>>> +						  async);
>>>    	while (num_syncs--)
>>>    		xe_sync_entry_cleanup(&syncs[num_syncs]);
>>> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
>>> index 23abdfd8622f..ce8b9bde7e9c 100644
>>> --- a/drivers/gpu/drm/xe/xe_vm_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
>>> @@ -167,13 +167,12 @@ struct xe_vm {
>>>    	 */
>>>    #define XE_VM_FLAG_64K			BIT(0)
>>>    #define XE_VM_FLAG_LR_MODE		BIT(1)
>>> -#define XE_VM_FLAG_ASYNC_DEFAULT	BIT(2)
>>> -#define XE_VM_FLAG_MIGRATION		BIT(3)
>>> -#define XE_VM_FLAG_SCRATCH_PAGE		BIT(4)
>>> -#define XE_VM_FLAG_FAULT_MODE		BIT(5)
>>> -#define XE_VM_FLAG_BANNED		BIT(6)
>>> -#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(8, 7), flags)
>>> -#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(8, 7), (tile)->id)
>>> +#define XE_VM_FLAG_MIGRATION		BIT(2)
>>> +#define XE_VM_FLAG_SCRATCH_PAGE		BIT(3)
>>> +#define XE_VM_FLAG_FAULT_MODE		BIT(4)
>>> +#define XE_VM_FLAG_BANNED		BIT(5)
>>> +#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(7, 6), flags)
>>> +#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(7, 6), (tile)->id)
>>>    	unsigned long flags;
>>>    	/** @composite_fence_ctx: context composite fence */
>>> @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
>>>    	XE_VMA_OP_PREV_COMMITTED	= BIT(3),
>>>    	/** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */
>>>    	XE_VMA_OP_NEXT_COMMITTED	= BIT(4),
>>> +	/** @XE_VMA_OP_ASYNC: operation is async */
>>> +	XE_VMA_OP_ASYNC			= BIT(5),
>>>    };
>>>    /** struct xe_vma_op - VMA operation */
>>> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
>>> index eb03a49c17a1..fd8172fe2d9a 100644
>>> --- a/include/uapi/drm/xe_drm.h
>>> +++ b/include/uapi/drm/xe_drm.h
>>> @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
>>>    	 * Kernel only classes (not actual hardware engine class). Used for
>>>    	 * creating ordered queues of VM bind operations.
>>>    	 */
>>> -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC	5
>>> -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC	6
>>> +#define DRM_XE_ENGINE_CLASS_VM_BIND		5
>>>    	__u16 engine_class;
>>>    	__u16 engine_instance;
>>> @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
>>>    	 * still enable recoverable pagefaults if supported by the device.
>>>    	 */
>>>    #define DRM_XE_VM_CREATE_FLAG_LR_MODE	        (1 << 1)
>>> -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT	(1 << 2)
>>>    	/*
>>>    	 * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
>>>    	 * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to be allocated
>>> @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
>>>    	 * The xe driver internally uses recoverable pagefaults to implement
>>>    	 * this.
>>>    	 */
>>> -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 3)
>>> +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 2)
>>>    	/** @flags: Flags */
>>>    	__u32 flags;
>>> @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
>>>    	__u32 op;
>>>    #define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
>>> -#define DRM_XE_VM_BIND_FLAG_ASYNC	(1 << 1)
>>>    	/*
>>>    	 * Valid on a faulting VM only, do the MAP operation immediately rather
>>>    	 * than deferring the MAP to the page fault handler.
>>>    	 */
>>> -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 2)
>>> +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
>>>    	/*
>>>    	 * When the NULL flag is set, the page tables are setup with a special
>>>    	 * bit which indicates writes are dropped and all reads return zero.  In
>>> @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
>>>    	 * operations, the BO handle MBZ, and the BO offset MBZ. This flag is
>>>    	 * intended to implement VK sparse bindings.
>>>    	 */
>>> -#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 3)
>>> +#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
>>>    	/** @flags: Bind flags */
>>>    	__u32 flags;
>>> @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
>>>    	__u64 reserved[3];
>>>    };
>>> +/**
>>> + * struct drm_xe_syncs - In / out syncs for IOCTLs.
>>> + */
>>> +struct drm_xe_syncs {
>>> +	/** @num_syncs: amount of syncs to wait on */
>>> +	__u32 num_syncs;
>>> +
>>> +	/*
>>> +	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
>>> +	 */
>>> +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
>>> +	/** @flags: Sync flags */
>>> +	__u32 flags;
>>> +
>>> +	/** @syncs: pointer to struct drm_xe_sync array */
>>> +	__u64 syncs;
>>> +
>>> +	/** @reserved: Reserved */
>>> +	__u64 reserved[2];
>>> +};
>>> +
>>>    struct drm_xe_vm_bind {
>>>    	/** @extensions: Pointer to the first extension struct, if any */
>>>    	__u64 extensions;
>>> @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
>>>    		__u64 vector_of_binds;
>>>    	};
>>> -	/** @pad: MBZ */
>>> -	__u32 pad2;
>>> -
>>> -	/** @num_syncs: amount of syncs to wait on */
>>> -	__u32 num_syncs;
>>> -
>>> -	/** @syncs: pointer to struct drm_xe_sync array */
>>> -	__u64 syncs;
>>> +	/** @syncs: syncs for bind */
>>> +	struct drm_xe_syncs syncs;
>>>    	/** @reserved: Reserved */
>>>    	__u64 reserved[2];
>>> @@ -974,14 +986,14 @@ struct drm_xe_exec {
>>>    	/** @extensions: Pointer to the first extension struct, if any */
>>>    	__u64 extensions;
>>> +	/** @pad: MBZ */
>>> +	__u32 pad;
>>> +
>>>    	/** @exec_queue_id: Exec queue ID for the batch buffer */
>>>    	__u32 exec_queue_id;
>>> -	/** @num_syncs: Amount of struct drm_xe_sync in array. */
>>> -	__u32 num_syncs;
>>> -
>>> -	/** @syncs: Pointer to struct drm_xe_sync array. */
>>> -	__u64 syncs;
>>> +	/** @syncs: syncs for exec */
>>> +	struct drm_xe_syncs syncs;
>>>    	/**
>>>    	 * @address: address of batch buffer if num_batch_buffer == 1 or an
>>> @@ -995,8 +1007,8 @@ struct drm_xe_exec {
>>>    	 */
>>>    	__u16 num_batch_buffer;
>>> -	/** @pad: MBZ */
>>> -	__u16 pad[3];
>>> +	/** @pad2: MBZ */
>>> +	__u16 pad2[3];
>>>    	/** @reserved: Reserved */
>>>    	__u64 reserved[2];

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-11 15:43       ` Thomas Hellström
@ 2023-12-11 16:49         ` Matthew Brost
  2023-12-11 18:11           ` Thomas Hellström
  0 siblings, 1 reply; 22+ messages in thread
From: Matthew Brost @ 2023-12-11 16:49 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: Francois Dugast, intel-xe, Rodrigo Vivi

On Mon, Dec 11, 2023 at 04:43:06PM +0100, Thomas Hellström wrote:
> 
> On 12/8/23 10:45, Matthew Brost wrote:
> > On Fri, Dec 08, 2023 at 04:00:37PM +0100, Thomas Hellström wrote:
> > > On 12/7/23 06:57, Matthew Brost wrote:
> > > > Remove concept of async vs sync VM bind queues, rather make async vs
> > > > sync a per IOCTL choice. Since this is per IOCTL, it makes sense to have
> > > > a singular flag IOCTL rather than per VM bind op flag too. Add
> > > > DRM_XE_SYNCS_FLAG_WAIT_FOR_OP which is an input sync flag to support
> > > > this. Support this new flag for both the VM bind IOCTL and the exec
> > > > IOCTL to match behavior.
> > > > 
> > > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > Cc: Francois Dugast <francois.dugast@intel.com>
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >    drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++----
> > > >    drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
> > > >    drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
> > > >    drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-------------
> > > >    drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
> > > >    include/uapi/drm/xe_drm.h                |  56 +++++++-----
> > > >    6 files changed, 129 insertions(+), 119 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> > > > index 92b0da6580e8..c62cabfaa112 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec.c
> > > > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > > > @@ -130,12 +130,15 @@ static int xe_exec_begin(struct drm_exec *exec, struct xe_vm *vm)
> > > >    	return err;
> > > >    }
> > > > +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> > > > +
> > > >    int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    {
> > > >    	struct xe_device *xe = to_xe_device(dev);
> > > >    	struct xe_file *xef = to_xe_file(file);
> > > >    	struct drm_xe_exec *args = data;
> > > > -	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
> > > > +	struct drm_xe_sync __user *syncs_user =
> > > > +		u64_to_user_ptr(args->syncs.syncs);
> > > >    	u64 __user *addresses_user = u64_to_user_ptr(args->address);
> > > >    	struct xe_exec_queue *q;
> > > >    	struct xe_sync_entry *syncs = NULL;
> > > > @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    	struct drm_exec exec;
> > > >    	u32 i, num_syncs = 0;
> > > >    	struct xe_sched_job *job;
> > > > -	struct dma_fence *rebind_fence;
> > > > +	struct dma_fence *rebind_fence, *job_fence;
> > > >    	struct xe_vm *vm;
> > > > -	bool write_locked;
> > > > +	bool write_locked, skip_job_put = false;
> > > > +	bool wait = args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
> > > >    	ktime_t end = 0;
> > > >    	int err = 0;
> > > >    	if (XE_IOCTL_DBG(xe, args->extensions) ||
> > > > -	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
> > > > -	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> > > > +	    XE_IOCTL_DBG(xe, args->pad || args->pad2[0] || args->pad2[1] || args->pad2[2]) ||
> > > > +	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
> > > > +	    XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > > > +	    XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
> > > >    		return -EINVAL;
> > > >    	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> > > > @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    		goto err_exec_queue;
> > > >    	}
> > > > -	if (args->num_syncs) {
> > > > -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> > > > +	if (args->syncs.num_syncs) {
> > > > +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs),
> > > > +				GFP_KERNEL);
> > > >    		if (!syncs) {
> > > >    			err = -ENOMEM;
> > > >    			goto err_exec_queue;
> > > > @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    	vm = q->vm;
> > > > -	for (i = 0; i < args->num_syncs; i++) {
> > > > +	for (i = 0; i < args->syncs.num_syncs; i++) {
> > > >    		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
> > > >    					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
> > > >    					  (xe_vm_in_lr_mode(vm) ?
> > > > @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    				err = PTR_ERR(fence);
> > > >    				goto err_exec;
> > > >    			}
> > > > +
> > > >    			for (i = 0; i < num_syncs; i++)
> > > >    				xe_sync_entry_signal(&syncs[i], NULL, fence);
> > > > +
> > > >    			xe_exec_queue_last_fence_set(q, vm, fence);
> > > > +			if (wait) {
> > > > +				long timeout = dma_fence_wait(fence, true);
> > > > +
> > > > +				if (timeout < 0)
> > > > +					err = -EINTR;
> > > > +			}
> > > Here it looks like we will rerun the same IOCTL again if we return -EINTR.
> > > The user-space expected action on -EINTR is to just restart the IOCTL
> > > without any argument changes. Solution is to add an ioctl argument cookie
> > > (or to skip sync vm binds and have the user just use the 0 batch buffers /
> > > vm_binds calls or wait for an out-fence). If you go for the cookie solution
> > > then IMO we should keep the -ERESTARTSYS returned from dma_fence_wait()
> > > since it's converted to -EINTR on return-to-user-space, and the kernel
> > > restarts the IOCTL automatically if there was no requested-for-delivery
> > > signal pending.
> > > 
> > > I think the simplest solution at this point is to skip the sync behaviour,
> > > in particular if we enable the 0 batch / bind possibility.
> > > 
> > > If we still want to provide it, we could add a cookie address as an
> > > extension to the ioctl and activate sync if present? (Just throwing up ideas
> > > here).
> > > 
> > Hmm, forgot about this. A cookie is fairly easy; what about something like this:
> > 
> >   807 /**
> >   808  * struct drm_xe_syncs - In / out syncs for IOCTLs.
> >   809  */
> >   810 struct drm_xe_syncs {
> >   811         /** @num_syncs: amount of syncs to wait on */
> >   812         __u32 num_syncs;
> >   813
> >   814         /*
> >   815          * Block in IOCTL until operation complete, num_syncs MBZ if set.
> >   816          */
> >   817 #define DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP (1 << 0)
> >   818         /** @in_flags: Input Sync flags */
> >   819         __u16 in_flags;
> >   820
> >   821         /*
> >   822          * IOCTL operation has started (no need for user to resubmit on
> >   823          * -ERESTARTSYS)
> >   824          */
> >   825 #define DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED (1 << 0)
> >   826         /** @out_flags: Output Sync flags */
> >   827         __u16 out_flags;
> >   828
> >   829         /** @syncs: pointer to struct drm_xe_sync array */
> >   830         __u64 syncs;
> >   831
> >   832         /** @reserved: Reserved */
> >   833         __u64 reserved[2];
> >   834 };
> > 
> > DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED gets set in the exec / bind IOCTL after
> > the job is committed or, in the case of zero ops, after the last fence is
> > updated on the queue. Note that for binds we don't yet do 1 job per IOCTL,
> > but we will after landing some version of [1]
> > 
> > After DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED is set we return -ERESTARTSYS if
> > the wait is interrupted; -EINTR is still returned if
> > DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED is unset (interrupted before the job is
> > committed).
> > 
> > I'd rather go with this patch: we have to change the uAPI here regardless,
> > so we might as well make it complete.
> > 
> > Matt
> > 
> > [1] https://patchwork.freedesktop.org/series/125608/
> 
> Yeah, as we discussed in the meeting, that means making the ioctl RW
> instead of W, with some copying overhead.
> 
> I also think we should leave the EXEC ioctl out of this, meaning just having
> a single field in the VM_BIND ioctl. Basically the reason is that waiting
> like this after submission is a bit weird and does not align well with how
> -EINTR is typically used.
> 

I kinda like uniform behavior between execs and binds, defined in a common
sync structure.
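
To make that concrete, here is a minimal userspace sketch of a synchronous
bind under the series as posted (using the drm_xe_syncs layout from the patch
above; error handling elided):

#include <string.h>
#include <xf86drm.h>
#include "xe_drm.h"

/* Sketch: block in the VM bind IOCTL until the (un)bind completes. */
static int xe_vm_bind_sync(int fd, struct drm_xe_vm_bind *bind)
{
	/* WAIT_FOR_OP requires num_syncs == 0 per the proposal. */
	memset(&bind->syncs, 0, sizeof(bind->syncs));
	bind->syncs.flags = DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;

	return drmIoctl(fd, DRM_IOCTL_XE_VM_BIND, bind);
}

The exec side would fill in the identical embedded struct. One caveat:
libdrm's drmIoctl() retries automatically on EINTR, which is exactly the
blind-restart behavior the cookie discussion here is about.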

> So either a pointer to a cookie in the ioctl,
> 

What about:

/**
 * struct drm_xe_syncs - In / out syncs for IOCTLs.
 */
struct drm_xe_syncs {
	/** @num_syncs: amount of syncs to wait on */
	__u32 num_syncs;

	/*
	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
	 */
#define DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP (1 << 0)
	/** @in_flags: Input sync flags */
	__u32 in_flags;

	/**
	 * @cookie: userptr cookie written back with a non-zero value once the
	 * operation is committed, only valid when the IOCTL returns -EINTR
	 */
	__u64 cookie;

	/** @syncs: pointer to struct drm_xe_sync array */
	__u64 syncs;

	/** @reserved: Reserved */
	__u64 reserved[2];
};

Also, if cookie is 0, should we wait uninterruptibly once the op is committed?
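
For illustration, a sketch of how a UMD might consume that cookie
(hypothetical wrapper; DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP and @cookie are from
the proposal above, not merged uAPI):

#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include "xe_drm.h"

/* Sketch: raw ioctl() on purpose, since drmIoctl() would blindly restart. */
static int xe_vm_bind_sync_restartable(int fd, struct drm_xe_vm_bind *bind)
{
	__u64 committed = 0;
	int ret;

	bind->syncs.in_flags = DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP;
	bind->syncs.cookie = (__u64)(uintptr_t)&committed;

	do {
		ret = ioctl(fd, DRM_IOCTL_XE_VM_BIND, bind);
	} while (ret == -1 && errno == EINTR && !committed);

	/*
	 * committed != 0 with EINTR: the binds are already queued, so wait on
	 * an out-fence or a num_binds == 0 call instead of resubmitting.
	 */
	return ret;
}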

> or perhaps dig up again the idea we had of mostly waiting before the
> submission:
> 
> 1) Pull out the last_op fence for the queue from under the relevant lock.
> 2) Wait for all dependencies without any locks.
> 3) Lock, and (optionally) if the last_op fence changed, wait for it.
> 4) Submit
> 5) Wait for completion uninterruptible.
> 

We can always change the internal implementation to something like this
after [1]. That series makes refactors like this quite a bit easier.
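
As pseudocode, that flow might roughly look like this (all helper names
below are placeholders, not existing driver functions; fence reference
handling elided):

static int vm_bind_sync_wait_first(struct xe_vm *vm, struct xe_exec_queue *q)
{
	struct dma_fence *last;
	long ret;

	last = get_last_op_fence(vm, q);	/* 1: under the relevant lock */
	ret = dma_fence_wait(last, true);	/* 2: interruptible, no locks held */
	if (ret < 0)
		return ret;			/* nothing committed, safe to restart */

	lock_vm(vm);				/* 3 */
	if (last_op_fence_changed(vm, q, last))
		wait_for_current_last_fence(vm, q);

	submit_bind_ops(vm, q);			/* 4 */
	dma_fence_wait(op_fence(q), false);	/* 5: uninterruptible */
	unlock_vm(vm);
	return 0;
}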

Matt

[1] https://patchwork.freedesktop.org/series/125608/ 

> I actually like this last one best, but we'd recommend UMDs use
> out-fences whenever possible.
> 
> Thoughts?
> 
> > 
> > > >    			dma_fence_put(fence);
> > > >    		}
> > > > @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    	 * the job and let the DRM scheduler / backend clean up the job.
> > > >    	 */
> > > >    	xe_sched_job_arm(job);
> > > > +	job_fence = &job->drm.s_fence->finished;
> > > > +	if (wait)
> > > > +		dma_fence_get(job_fence);
> > > >    	if (!xe_vm_in_lr_mode(vm)) {
> > > >    		/* Block userptr invalidations / BO eviction */
> > > > -		dma_resv_add_fence(&vm->resv,
> > > > -				   &job->drm.s_fence->finished,
> > > > +		dma_resv_add_fence(&vm->resv, job_fence,
> > > >    				   DMA_RESV_USAGE_BOOKKEEP);
> > > >    		/*
> > > >    		 * Make implicit sync work across drivers, assuming all external
> > > >    		 * BOs are written as we don't pass in a read / write list.
> > > >    		 */
> > > > -		xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
> > > > -					DMA_RESV_USAGE_WRITE);
> > > > +		xe_vm_fence_all_extobjs(vm, job_fence, DMA_RESV_USAGE_WRITE);
> > > >    	}
> > > >    	for (i = 0; i < num_syncs; i++)
> > > > -		xe_sync_entry_signal(&syncs[i], job,
> > > > -				     &job->drm.s_fence->finished);
> > > > +		xe_sync_entry_signal(&syncs[i], job, job_fence);
> > > >    	if (xe_exec_queue_is_lr(q))
> > > >    		q->ring_ops->emit_job(job);
> > > >    	if (!xe_vm_in_lr_mode(vm))
> > > > -		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
> > > > +		xe_exec_queue_last_fence_set(q, vm, job_fence);
> > > >    	xe_sched_job_push(job);
> > > >    	xe_vm_reactivate_rebind(vm);
> > > > -	if (!err && !xe_vm_in_lr_mode(vm)) {
> > > > +	if (!xe_vm_in_lr_mode(vm)) {
> > > >    		spin_lock(&xe->ttm.lru_lock);
> > > >    		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
> > > >    		spin_unlock(&xe->ttm.lru_lock);
> > > >    	}
> > > > +	skip_job_put = true;
> > > > +	if (wait) {
> > > > +		long timeout = dma_fence_wait(job_fence, true);
> > > > +
> > > > +		dma_fence_put(job_fence);
> > > > +		if (timeout < 0)
> > > > +			err = -EINTR;
> > > > +	}
> > > > +
> > > >    err_repin:
> > > >    	if (!xe_vm_in_lr_mode(vm))
> > > >    		up_read(&vm->userptr.notifier_lock);
> > > >    err_put_job:
> > > > -	if (err)
> > > > +	if (err && !skip_job_put)
> > > >    		xe_sched_job_put(job);
> > > >    err_exec:
> > > >    	drm_exec_fini(&exec);
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > index 3911d14522ee..98776d02d634 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
> > > >    	if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
> > > >    		return -EINVAL;
> > > > -	if (eci[0].engine_class >= DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
> > > > -		bool sync = eci[0].engine_class ==
> > > > -			DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
> > > > -
> > > > +	if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
> > > >    		for_each_gt(gt, xe, id) {
> > > >    			struct xe_exec_queue *new;
> > > > @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
> > > >    						   args->width, hwe,
> > > >    						   EXEC_QUEUE_FLAG_PERSISTENT |
> > > >    						   EXEC_QUEUE_FLAG_VM |
> > > > -						   (sync ? 0 :
> > > > -						    EXEC_QUEUE_FLAG_VM_ASYNC) |
> > > >    						   (id ?
> > > >    						    EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
> > > >    						    0));
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > index 52f0927d0d9b..c78f6e8b41c4 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > @@ -74,8 +74,6 @@ struct xe_exec_queue {
> > > >    #define EXEC_QUEUE_FLAG_VM			BIT(4)
> > > >    /* child of VM queue for multi-tile VM jobs */
> > > >    #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD	BIT(5)
> > > > -/* VM jobs for this queue are asynchronous */
> > > > -#define EXEC_QUEUE_FLAG_VM_ASYNC		BIT(6)
> > > >    	/**
> > > >    	 * @flags: flags for this exec queue, should statically setup aside from ban
> > > > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > > > index cf2eb44a71db..4b0c976c003a 100644
> > > > --- a/drivers/gpu/drm/xe/xe_vm.c
> > > > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > > > @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
> > > >    			struct xe_gt *gt = tile->primary_gt;
> > > >    			struct xe_vm *migrate_vm;
> > > >    			struct xe_exec_queue *q;
> > > > -			u32 create_flags = EXEC_QUEUE_FLAG_VM |
> > > > -				((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
> > > > -				EXEC_QUEUE_FLAG_VM_ASYNC : 0);
> > > > +			u32 create_flags = EXEC_QUEUE_FLAG_VM;
> > > >    			if (!vm->pt_root[id])
> > > >    				continue;
> > > > @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
> > > >    	return ERR_PTR(err);
> > > >    }
> > > > -static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
> > > > -{
> > > > -	return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
> > > > -		!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
> > > > -}
> > > > -
> > > >    static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
> > > >    			u32 num_syncs, bool immediate, bool first_op,
> > > > -			bool last_op)
> > > > +			bool last_op, bool async)
> > > >    {
> > > >    	struct dma_fence *fence;
> > > >    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> > > > @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    	if (last_op)
> > > >    		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> > > > -	if (last_op && xe_vm_sync_mode(vm, q))
> > > > +	if (last_op && !async)
> > > >    		dma_fence_wait(fence, true);
> > > >    	dma_fence_put(fence);
> > > > @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
> > > >    		      struct xe_bo *bo, struct xe_sync_entry *syncs,
> > > >    		      u32 num_syncs, bool immediate, bool first_op,
> > > > -		      bool last_op)
> > > > +		      bool last_op, bool async)
> > > >    {
> > > >    	int err;
> > > > @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue
> > > >    	}
> > > >    	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
> > > > -			    last_op);
> > > > +			    last_op, async);
> > > >    }
> > > >    static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
> > > > -			u32 num_syncs, bool first_op, bool last_op)
> > > > +			u32 num_syncs, bool first_op, bool last_op, bool async)
> > > >    {
> > > >    	struct dma_fence *fence;
> > > >    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> > > > @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    	xe_vma_destroy(vma, fence);
> > > >    	if (last_op)
> > > >    		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> > > > -	if (last_op && xe_vm_sync_mode(vm, q))
> > > > +	if (last_op && !async)
> > > >    		dma_fence_wait(fence, true);
> > > It looks like we're dropping the error return code here.
> > > 
> > > 
> > > >    	dma_fence_put(fence);
> > > > @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
> > > >    				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
> > > > -				    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
> > > >    				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> > > >    int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> > > > @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> > > >    		flags |= XE_VM_FLAG_SCRATCH_PAGE;
> > > >    	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
> > > >    		flags |= XE_VM_FLAG_LR_MODE;
> > > > -	if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
> > > > -		flags |= XE_VM_FLAG_ASYNC_DEFAULT;
> > > >    	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> > > >    		flags |= XE_VM_FLAG_FAULT_MODE;
> > > > @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] = {
> > > >    static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
> > > >    			  struct xe_exec_queue *q, u32 region,
> > > >    			  struct xe_sync_entry *syncs, u32 num_syncs,
> > > > -			  bool first_op, bool last_op)
> > > > +			  bool first_op, bool last_op, bool async)
> > > >    {
> > > >    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> > > >    	int err;
> > > > @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
> > > >    	if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
> > > >    		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
> > > > -				  true, first_op, last_op);
> > > > +				  true, first_op, last_op, async);
> > > >    	} else {
> > > >    		int i;
> > > > @@ -2400,6 +2389,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
> > > >    		}
> > > >    		op->q = q;
> > > > +		if (async)
> > > > +			op->flags |= XE_VMA_OP_ASYNC;
> > > >    		switch (op->base.op) {
> > > >    		case DRM_GPUVA_OP_MAP:
> > > > @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> > > >    				 op->syncs, op->num_syncs,
> > > >    				 op->map.immediate || !xe_vm_in_fault_mode(vm),
> > > >    				 op->flags & XE_VMA_OP_FIRST,
> > > > -				 op->flags & XE_VMA_OP_LAST);
> > > > +				 op->flags & XE_VMA_OP_LAST,
> > > > +				 op->flags & XE_VMA_OP_ASYNC);
> > > >    		break;
> > > >    	case DRM_GPUVA_OP_REMAP:
> > > >    	{
> > > > @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> > > >    					   op->num_syncs,
> > > >    					   op->flags & XE_VMA_OP_FIRST,
> > > >    					   op->flags & XE_VMA_OP_LAST &&
> > > > -					   !prev && !next);
> > > > +					   !prev && !next,
> > > > +					   op->flags & XE_VMA_OP_ASYNC);
> > > >    			if (err)
> > > >    				break;
> > > >    			op->remap.unmap_done = true;
> > > > @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> > > >    			err = xe_vm_bind(vm, op->remap.prev, op->q,
> > > >    					 xe_vma_bo(op->remap.prev), op->syncs,
> > > >    					 op->num_syncs, true, false,
> > > > -					 op->flags & XE_VMA_OP_LAST && !next);
> > > > +					 op->flags & XE_VMA_OP_LAST && !next,
> > > > +					 op->flags & XE_VMA_OP_ASYNC);
> > > >    			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
> > > >    			if (err)
> > > >    				break;
> > > > @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> > > >    					 xe_vma_bo(op->remap.next),
> > > >    					 op->syncs, op->num_syncs,
> > > >    					 true, false,
> > > > -					 op->flags & XE_VMA_OP_LAST);
> > > > +					 op->flags & XE_VMA_OP_LAST,
> > > > +					 op->flags & XE_VMA_OP_ASYNC);
> > > >    			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
> > > >    			if (err)
> > > >    				break;
> > > > @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> > > >    	case DRM_GPUVA_OP_UNMAP:
> > > >    		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
> > > >    				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
> > > > -				   op->flags & XE_VMA_OP_LAST);
> > > > +				   op->flags & XE_VMA_OP_LAST,
> > > > +				   op->flags & XE_VMA_OP_ASYNC);
> > > >    		break;
> > > >    	case DRM_GPUVA_OP_PREFETCH:
> > > >    		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
> > > >    				     op->syncs, op->num_syncs,
> > > >    				     op->flags & XE_VMA_OP_FIRST,
> > > > -				     op->flags & XE_VMA_OP_LAST);
> > > > +				     op->flags & XE_VMA_OP_LAST,
> > > > +				     op->flags & XE_VMA_OP_ASYNC);
> > > >    		break;
> > > >    	default:
> > > >    		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
> > > > @@ -2808,16 +2805,16 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
> > > >    #ifdef TEST_VM_ASYNC_OPS_ERROR
> > > >    #define SUPPORTED_FLAGS	\
> > > > -	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
> > > > -	 DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
> > > > -	 DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> > > > +	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
> > > > +	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> > > >    #else
> > > >    #define SUPPORTED_FLAGS	\
> > > > -	(DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
> > > > +	(DRM_XE_VM_BIND_FLAG_READONLY | \
> > > >    	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
> > > >    	 0xffff)
> > > >    #endif
> > > >    #define XE_64K_PAGE_MASK 0xffffull
> > > > +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> > > >    #define MAX_BINDS	512	/* FIXME: Picking random upper limit */
> > > > @@ -2829,7 +2826,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> > > >    	int err;
> > > >    	int i;
> > > > -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > > > +	if (XE_IOCTL_DBG(xe, args->pad) ||
> > > >    	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> > > >    		return -EINVAL;
> > > > @@ -2857,6 +2854,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> > > >    		*bind_ops = &args->bind;
> > > >    	}
> > > > +	*async = !(args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
> > > > +
> > > > +	if (XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > > > +	    XE_IOCTL_DBG(xe, !*async && args->syncs.num_syncs)) {
> > > > +		err = -EINVAL;
> > > > +		goto free_bind_ops;
> > > > +	}
> > > > +
> > > >    	for (i = 0; i < args->num_binds; ++i) {
> > > >    		u64 range = (*bind_ops)[i].range;
> > > >    		u64 addr = (*bind_ops)[i].addr;
> > > > @@ -2887,18 +2892,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> > > >    			goto free_bind_ops;
> > > >    		}
> > > > -		if (i == 0) {
> > > > -			*async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
> > > > -			if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
> > > > -				err = -EINVAL;
> > > > -				goto free_bind_ops;
> > > > -			}
> > > > -		} else if (XE_IOCTL_DBG(xe, *async !=
> > > > -					!!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
> > > > -			err = -EINVAL;
> > > > -			goto free_bind_ops;
> > > > -		}
> > > > -
> > > >    		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
> > > >    		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
> > > >    		    XE_IOCTL_DBG(xe, obj && is_null) ||
> > > > @@ -2951,7 +2944,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> > > >    static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> > > >    				       struct xe_exec_queue *q,
> > > >    				       struct xe_sync_entry *syncs,
> > > > -				       int num_syncs)
> > > > +				       int num_syncs, bool async)
> > > >    {
> > > >    	struct dma_fence *fence;
> > > >    	int i, err = 0;
> > > > @@ -2967,7 +2960,7 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> > > >    	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
> > > >    				     fence);
> > > > -	if (xe_vm_sync_mode(vm, q)) {
> > > > +	if (!async) {
> > > >    		long timeout = dma_fence_wait(fence, true);
> > > >    		if (timeout < 0)
> > > > @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    	if (err)
> > > >    		return err;
> > > > -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > > > +	if (XE_IOCTL_DBG(xe, args->pad) ||
> > > >    	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> > > >    		return -EINVAL;
> > > > @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    			err = -EINVAL;
> > > >    			goto put_exec_queue;
> > > >    		}
> > > > -
> > > > -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> > > > -				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
> > > > -			err = -EINVAL;
> > > > -			goto put_exec_queue;
> > > > -		}
> > > >    	}
> > > >    	vm = xe_vm_lookup(xef, args->vm_id);
> > > > @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    		goto put_exec_queue;
> > > >    	}
> > > > -	if (!args->exec_queue_id) {
> > > > -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> > > > -				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
> > > > -			err = -EINVAL;
> > > > -			goto put_vm;
> > > > -		}
> > > > -	}
> > > > -
> > > >    	err = down_write_killable(&vm->lock);
> > > >    	if (err)
> > > >    		goto put_vm;
> > > > @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    		}
> > > >    	}
> > > > -	if (args->num_syncs) {
> > > > -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> > > > +	if (args->syncs.num_syncs) {
> > > > +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs), GFP_KERNEL);
> > > >    		if (!syncs) {
> > > >    			err = -ENOMEM;
> > > >    			goto put_obj;
> > > >    		}
> > > >    	}
> > > > -	syncs_user = u64_to_user_ptr(args->syncs);
> > > > -	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
> > > > +	syncs_user = u64_to_user_ptr(args->syncs.syncs);
> > > > +	for (num_syncs = 0; num_syncs < args->syncs.num_syncs; num_syncs++) {
> > > >    		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
> > > >    					  &syncs_user[num_syncs],
> > > >    					  (xe_vm_in_lr_mode(vm) ?
> > > > @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
> > > >    free_syncs:
> > > >    	if (err == -ENODATA)
> > > > -		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
> > > > +		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs,
> > > > +						  async);
> > > >    	while (num_syncs--)
> > > >    		xe_sync_entry_cleanup(&syncs[num_syncs]);
> > > > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> > > > index 23abdfd8622f..ce8b9bde7e9c 100644
> > > > --- a/drivers/gpu/drm/xe/xe_vm_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> > > > @@ -167,13 +167,12 @@ struct xe_vm {
> > > >    	 */
> > > >    #define XE_VM_FLAG_64K			BIT(0)
> > > >    #define XE_VM_FLAG_LR_MODE		BIT(1)
> > > > -#define XE_VM_FLAG_ASYNC_DEFAULT	BIT(2)
> > > > -#define XE_VM_FLAG_MIGRATION		BIT(3)
> > > > -#define XE_VM_FLAG_SCRATCH_PAGE		BIT(4)
> > > > -#define XE_VM_FLAG_FAULT_MODE		BIT(5)
> > > > -#define XE_VM_FLAG_BANNED		BIT(6)
> > > > -#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(8, 7), flags)
> > > > -#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(8, 7), (tile)->id)
> > > > +#define XE_VM_FLAG_MIGRATION		BIT(2)
> > > > +#define XE_VM_FLAG_SCRATCH_PAGE		BIT(3)
> > > > +#define XE_VM_FLAG_FAULT_MODE		BIT(4)
> > > > +#define XE_VM_FLAG_BANNED		BIT(5)
> > > > +#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(7, 6), flags)
> > > > +#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(7, 6), (tile)->id)
> > > >    	unsigned long flags;
> > > >    	/** @composite_fence_ctx: context composite fence */
> > > > @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
> > > >    	XE_VMA_OP_PREV_COMMITTED	= BIT(3),
> > > >    	/** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */
> > > >    	XE_VMA_OP_NEXT_COMMITTED	= BIT(4),
> > > > +	/** @XE_VMA_OP_ASYNC: operation is async */
> > > > +	XE_VMA_OP_ASYNC			= BIT(5),
> > > >    };
> > > >    /** struct xe_vma_op - VMA operation */
> > > > diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> > > > index eb03a49c17a1..fd8172fe2d9a 100644
> > > > --- a/include/uapi/drm/xe_drm.h
> > > > +++ b/include/uapi/drm/xe_drm.h
> > > > @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
> > > >    	 * Kernel only classes (not actual hardware engine class). Used for
> > > >    	 * creating ordered queues of VM bind operations.
> > > >    	 */
> > > > -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC	5
> > > > -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC	6
> > > > +#define DRM_XE_ENGINE_CLASS_VM_BIND		5
> > > >    	__u16 engine_class;
> > > >    	__u16 engine_instance;
> > > > @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
> > > >    	 * still enable recoverable pagefaults if supported by the device.
> > > >    	 */
> > > >    #define DRM_XE_VM_CREATE_FLAG_LR_MODE	        (1 << 1)
> > > > -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT	(1 << 2)
> > > >    	/*
> > > >    	 * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
> > > >    	 * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to be allocated
> > > > @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
> > > >    	 * The xe driver internally uses recoverable pagefaults to implement
> > > >    	 * this.
> > > >    	 */
> > > > -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 3)
> > > > +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 2)
> > > >    	/** @flags: Flags */
> > > >    	__u32 flags;
> > > > @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
> > > >    	__u32 op;
> > > >    #define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
> > > > -#define DRM_XE_VM_BIND_FLAG_ASYNC	(1 << 1)
> > > >    	/*
> > > >    	 * Valid on a faulting VM only, do the MAP operation immediately rather
> > > >    	 * than deferring the MAP to the page fault handler.
> > > >    	 */
> > > > -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 2)
> > > > +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
> > > >    	/*
> > > >    	 * When the NULL flag is set, the page tables are setup with a special
> > > >    	 * bit which indicates writes are dropped and all reads return zero.  In
> > > > @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
> > > >    	 * operations, the BO handle MBZ, and the BO offset MBZ. This flag is
> > > >    	 * intended to implement VK sparse bindings.
> > > >    	 */
> > > > -#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 3)
> > > > +#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
> > > >    	/** @flags: Bind flags */
> > > >    	__u32 flags;
> > > > @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
> > > >    	__u64 reserved[3];
> > > >    };
> > > > +/**
> > > > + * struct drm_xe_syncs - In / out syncs for IOCTLs.
> > > > + */
> > > > +struct drm_xe_syncs {
> > > > +	/** @num_syncs: amount of syncs to wait on */
> > > > +	__u32 num_syncs;
> > > > +
> > > > +	/*
> > > > +	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
> > > > +	 */
> > > > +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
> > > > +	/** @flags: Sync flags */
> > > > +	__u32 flags;
> > > > +
> > > > +	/** @syncs: pointer to struct drm_xe_sync array */
> > > > +	__u64 syncs;
> > > > +
> > > > +	/** @reserved: Reserved */
> > > > +	__u64 reserved[2];
> > > > +};
> > > > +
> > > >    struct drm_xe_vm_bind {
> > > >    	/** @extensions: Pointer to the first extension struct, if any */
> > > >    	__u64 extensions;
> > > > @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
> > > >    		__u64 vector_of_binds;
> > > >    	};
> > > > -	/** @pad: MBZ */
> > > > -	__u32 pad2;
> > > > -
> > > > -	/** @num_syncs: amount of syncs to wait on */
> > > > -	__u32 num_syncs;
> > > > -
> > > > -	/** @syncs: pointer to struct drm_xe_sync array */
> > > > -	__u64 syncs;
> > > > +	/** @syncs: syncs for bind */
> > > > +	struct drm_xe_syncs syncs;
> > > >    	/** @reserved: Reserved */
> > > >    	__u64 reserved[2];
> > > > @@ -974,14 +986,14 @@ struct drm_xe_exec {
> > > >    	/** @extensions: Pointer to the first extension struct, if any */
> > > >    	__u64 extensions;
> > > > +	/** @pad: MBZ */
> > > > +	__u32 pad;
> > > > +
> > > >    	/** @exec_queue_id: Exec queue ID for the batch buffer */
> > > >    	__u32 exec_queue_id;
> > > > -	/** @num_syncs: Amount of struct drm_xe_sync in array. */
> > > > -	__u32 num_syncs;
> > > > -
> > > > -	/** @syncs: Pointer to struct drm_xe_sync array. */
> > > > -	__u64 syncs;
> > > > +	/** @syncs: syncs for exec */
> > > > +	struct drm_xe_syncs syncs;
> > > >    	/**
> > > >    	 * @address: address of batch buffer if num_batch_buffer == 1 or an
> > > > @@ -995,8 +1007,8 @@ struct drm_xe_exec {
> > > >    	 */
> > > >    	__u16 num_batch_buffer;
> > > > -	/** @pad: MBZ */
> > > > -	__u16 pad[3];
> > > > +	/** @pad2: MBZ */
> > > > +	__u16 pad2[3];
> > > >    	/** @reserved: Reserved */
> > > >    	__u64 reserved[2];

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-11 15:34       ` Thomas Hellström
@ 2023-12-11 16:50         ` Matthew Brost
  0 siblings, 0 replies; 22+ messages in thread
From: Matthew Brost @ 2023-12-11 16:50 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: Francois Dugast, intel-xe, Rodrigo Vivi

On Mon, Dec 11, 2023 at 04:34:54PM +0100, Thomas Hellström wrote:
> 
> On 12/8/23 13:24, Matthew Brost wrote:
> > On Fri, Dec 08, 2023 at 04:00:37PM +0100, Thomas Hellström wrote:
> > Missed a comment, addressing below.
> > 
> > > On 12/7/23 06:57, Matthew Brost wrote:
> > > > Remove concept of async vs sync VM bind queues, rather make async vs
> > > > sync a per IOCTL choice. Since this is per IOCTL, it makes sense to have
> > > > a singular flag IOCTL rather than per VM bind op flag too. Add
> > > > DRM_XE_SYNCS_FLAG_WAIT_FOR_OP which is an input sync flag to support
> > > > this. Support this new flag for both the VM bind IOCTL and the exec
> > > > IOCTL to match behavior.
> > > > 
> > > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > Cc: Francois Dugast <francois.dugast@intel.com>
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >    drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++----
> > > >    drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
> > > >    drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
> > > >    drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-------------
> > > >    drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
> > > >    include/uapi/drm/xe_drm.h                |  56 +++++++-----
> > > >    6 files changed, 129 insertions(+), 119 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> > > > index 92b0da6580e8..c62cabfaa112 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec.c
> > > > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > > > @@ -130,12 +130,15 @@ static int xe_exec_begin(struct drm_exec *exec, struct xe_vm *vm)
> > > >    	return err;
> > > >    }
> > > > +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> > > > +
> > > >    int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    {
> > > >    	struct xe_device *xe = to_xe_device(dev);
> > > >    	struct xe_file *xef = to_xe_file(file);
> > > >    	struct drm_xe_exec *args = data;
> > > > -	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
> > > > +	struct drm_xe_sync __user *syncs_user =
> > > > +		u64_to_user_ptr(args->syncs.syncs);
> > > >    	u64 __user *addresses_user = u64_to_user_ptr(args->address);
> > > >    	struct xe_exec_queue *q;
> > > >    	struct xe_sync_entry *syncs = NULL;
> > > > @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    	struct drm_exec exec;
> > > >    	u32 i, num_syncs = 0;
> > > >    	struct xe_sched_job *job;
> > > > -	struct dma_fence *rebind_fence;
> > > > +	struct dma_fence *rebind_fence, *job_fence;
> > > >    	struct xe_vm *vm;
> > > > -	bool write_locked;
> > > > +	bool write_locked, skip_job_put = false;
> > > > +	bool wait = args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
> > > >    	ktime_t end = 0;
> > > >    	int err = 0;
> > > >    	if (XE_IOCTL_DBG(xe, args->extensions) ||
> > > > -	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
> > > > -	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> > > > +	    XE_IOCTL_DBG(xe, args->pad || args->pad2[0] || args->pad2[1] || args->pad2[2]) ||
> > > > +	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
> > > > +	    XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > > > +	    XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
> > > >    		return -EINVAL;
> > > >    	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> > > > @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    		goto err_exec_queue;
> > > >    	}
> > > > -	if (args->num_syncs) {
> > > > -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> > > > +	if (args->syncs.num_syncs) {
> > > > +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs),
> > > > +				GFP_KERNEL);
> > > >    		if (!syncs) {
> > > >    			err = -ENOMEM;
> > > >    			goto err_exec_queue;
> > > > @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    	vm = q->vm;
> > > > -	for (i = 0; i < args->num_syncs; i++) {
> > > > +	for (i = 0; i < args->syncs.num_syncs; i++) {
> > > >    		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
> > > >    					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
> > > >    					  (xe_vm_in_lr_mode(vm) ?
> > > > @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    				err = PTR_ERR(fence);
> > > >    				goto err_exec;
> > > >    			}
> > > > +
> > > >    			for (i = 0; i < num_syncs; i++)
> > > >    				xe_sync_entry_signal(&syncs[i], NULL, fence);
> > > > +
> > > >    			xe_exec_queue_last_fence_set(q, vm, fence);
> > > > +			if (wait) {
> > > > +				long timeout = dma_fence_wait(fence, true);
> > > > +
> > > > +				if (timeout < 0)
> > > > +					err = -EINTR;
> > > > +			}
> > > Here it looks like we will rerun the same IOCTL again if we return -EINTR.
> > > The user-space expected action on -EINTR is to just restart the IOCTL
> > > without any argument changes. Solution is to add an ioctl argument cookie
> > > (or to skip sync vm binds and have the user just use the 0 batch buffers /
> > > vm_binds calls or wait for an out-fence). If you go for the cookie solution
> > > then IMO we should keep the -ERESTARTSYS returned from dma_fence_wait()
> > > since it's converted to -EINTR on return-to-user-space, and the kernel
> > > restarts the IOCTL automatically if there was no requested-for-delivery
> > > signal pending.
> > > 
> > > I think the simplest solution at this point is to skip the sync behaviour,
> > > in particular if we enable the 0 batch / bind possibility.
> > > 
> > > If we still want to provide it, we could add a cookie address as an
> > > extension to the ioctl and activate sync if present? (Just throwing up ideas
> > > here).
> > > 
> > > >    			dma_fence_put(fence);
> > > >    		}
> > > > @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    	 * the job and let the DRM scheduler / backend clean up the job.
> > > >    	 */
> > > >    	xe_sched_job_arm(job);
> > > > +	job_fence = &job->drm.s_fence->finished;
> > > > +	if (wait)
> > > > +		dma_fence_get(job_fence);
> > > >    	if (!xe_vm_in_lr_mode(vm)) {
> > > >    		/* Block userptr invalidations / BO eviction */
> > > > -		dma_resv_add_fence(&vm->resv,
> > > > -				   &job->drm.s_fence->finished,
> > > > +		dma_resv_add_fence(&vm->resv, job_fence,
> > > >    				   DMA_RESV_USAGE_BOOKKEEP);
> > > >    		/*
> > > >    		 * Make implicit sync work across drivers, assuming all external
> > > >    		 * BOs are written as we don't pass in a read / write list.
> > > >    		 */
> > > > -		xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
> > > > -					DMA_RESV_USAGE_WRITE);
> > > > +		xe_vm_fence_all_extobjs(vm, job_fence, DMA_RESV_USAGE_WRITE);
> > > >    	}
> > > >    	for (i = 0; i < num_syncs; i++)
> > > > -		xe_sync_entry_signal(&syncs[i], job,
> > > > -				     &job->drm.s_fence->finished);
> > > > +		xe_sync_entry_signal(&syncs[i], job, job_fence);
> > > >    	if (xe_exec_queue_is_lr(q))
> > > >    		q->ring_ops->emit_job(job);
> > > >    	if (!xe_vm_in_lr_mode(vm))
> > > > -		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
> > > > +		xe_exec_queue_last_fence_set(q, vm, job_fence);
> > > >    	xe_sched_job_push(job);
> > > >    	xe_vm_reactivate_rebind(vm);
> > > > -	if (!err && !xe_vm_in_lr_mode(vm)) {
> > > > +	if (!xe_vm_in_lr_mode(vm)) {
> > > >    		spin_lock(&xe->ttm.lru_lock);
> > > >    		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
> > > >    		spin_unlock(&xe->ttm.lru_lock);
> > > >    	}
> > > > +	skip_job_put = true;
> > > > +	if (wait) {
> > > > +		long timeout = dma_fence_wait(job_fence, true);
> > > > +
> > > > +		dma_fence_put(job_fence);
> > > > +		if (timeout < 0)
> > > > +			err = -EINTR;
> > > > +	}
> > > > +
> > > >    err_repin:
> > > >    	if (!xe_vm_in_lr_mode(vm))
> > > >    		up_read(&vm->userptr.notifier_lock);
> > > >    err_put_job:
> > > > -	if (err)
> > > > +	if (err && !skip_job_put)
> > > >    		xe_sched_job_put(job);
> > > >    err_exec:
> > > >    	drm_exec_fini(&exec);
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > index 3911d14522ee..98776d02d634 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
> > > >    	if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
> > > >    		return -EINVAL;
> > > > -	if (eci[0].engine_class >= DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
> > > > -		bool sync = eci[0].engine_class ==
> > > > -			DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
> > > > -
> > > > +	if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
> > > >    		for_each_gt(gt, xe, id) {
> > > >    			struct xe_exec_queue *new;
> > > > @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
> > > >    						   args->width, hwe,
> > > >    						   EXEC_QUEUE_FLAG_PERSISTENT |
> > > >    						   EXEC_QUEUE_FLAG_VM |
> > > > -						   (sync ? 0 :
> > > > -						    EXEC_QUEUE_FLAG_VM_ASYNC) |
> > > >    						   (id ?
> > > >    						    EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
> > > >    						    0));
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > index 52f0927d0d9b..c78f6e8b41c4 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > @@ -74,8 +74,6 @@ struct xe_exec_queue {
> > > >    #define EXEC_QUEUE_FLAG_VM			BIT(4)
> > > >    /* child of VM queue for multi-tile VM jobs */
> > > >    #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD	BIT(5)
> > > > -/* VM jobs for this queue are asynchronous */
> > > > -#define EXEC_QUEUE_FLAG_VM_ASYNC		BIT(6)
> > > >    	/**
> > > >    	 * @flags: flags for this exec queue, should statically setup aside from ban
> > > > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > > > index cf2eb44a71db..4b0c976c003a 100644
> > > > --- a/drivers/gpu/drm/xe/xe_vm.c
> > > > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > > > @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
> > > >    			struct xe_gt *gt = tile->primary_gt;
> > > >    			struct xe_vm *migrate_vm;
> > > >    			struct xe_exec_queue *q;
> > > > -			u32 create_flags = EXEC_QUEUE_FLAG_VM |
> > > > -				((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
> > > > -				EXEC_QUEUE_FLAG_VM_ASYNC : 0);
> > > > +			u32 create_flags = EXEC_QUEUE_FLAG_VM;
> > > >    			if (!vm->pt_root[id])
> > > >    				continue;
> > > > @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
> > > >    	return ERR_PTR(err);
> > > >    }
> > > > -static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
> > > > -{
> > > > -	return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
> > > > -		!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
> > > > -}
> > > > -
> > > >    static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
> > > >    			u32 num_syncs, bool immediate, bool first_op,
> > > > -			bool last_op)
> > > > +			bool last_op, bool async)
> > > >    {
> > > >    	struct dma_fence *fence;
> > > >    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> > > > @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    	if (last_op)
> > > >    		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> > > > -	if (last_op && xe_vm_sync_mode(vm, q))
> > > > +	if (last_op && !async)
> > > >    		dma_fence_wait(fence, true);
> > > >    	dma_fence_put(fence);
> > > > @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
> > > >    		      struct xe_bo *bo, struct xe_sync_entry *syncs,
> > > >    		      u32 num_syncs, bool immediate, bool first_op,
> > > > -		      bool last_op)
> > > > +		      bool last_op, bool async)
> > > >    {
> > > >    	int err;
> > > > @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue
> > > >    	}
> > > >    	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
> > > > -			    last_op);
> > > > +			    last_op, async);
> > > >    }
> > > >    static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
> > > > -			u32 num_syncs, bool first_op, bool last_op)
> > > > +			u32 num_syncs, bool first_op, bool last_op, bool async)
> > > >    {
> > > >    	struct dma_fence *fence;
> > > >    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> > > > @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    	xe_vma_destroy(vma, fence);
> > > >    	if (last_op)
> > > >    		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
> > > > -	if (last_op && xe_vm_sync_mode(vm, q))
> > > > +	if (last_op && !async)
> > > >    		dma_fence_wait(fence, true);
> > > It looks like we're dropping the error return code here.
> > > 
> > I am aware of this. This is fixed in the larger refactor of the VM bind
> > error handling [1]. The idea with this series is land the uAPI and get
> > the implementation 100% correct in the larger follow up series.
> > 
> > Matt
> > 
> > [1] https://patchwork.freedesktop.org/series/125608/
> 
> Then I think we should wait uninterruptibly until that is complete.
> 
> /Thomas
> 

That's an option; let's settle on the uAPI and then figure out the implementation details / plan.

Matt

> 
> > 
> > > >    	dma_fence_put(fence);
> > > > @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
> > > >    #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
> > > >    				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
> > > > -				    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
> > > >    				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> > > >    int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> > > > @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> > > >    		flags |= XE_VM_FLAG_SCRATCH_PAGE;
> > > >    	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
> > > >    		flags |= XE_VM_FLAG_LR_MODE;
> > > > -	if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
> > > > -		flags |= XE_VM_FLAG_ASYNC_DEFAULT;
> > > >    	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> > > >    		flags |= XE_VM_FLAG_FAULT_MODE;
> > > > @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] = {
> > > >    static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
> > > >    			  struct xe_exec_queue *q, u32 region,
> > > >    			  struct xe_sync_entry *syncs, u32 num_syncs,
> > > > -			  bool first_op, bool last_op)
> > > > +			  bool first_op, bool last_op, bool async)
> > > >    {
> > > >    	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
> > > >    	int err;
> > > > @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
> > > >    	if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
> > > >    		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
> > > > -				  true, first_op, last_op);
> > > > +				  true, first_op, last_op, async);
> > > >    	} else {
> > > >    		int i;
> > > > @@ -2400,6 +2389,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
> > > >    		}
> > > >    		op->q = q;
> > > > +		if (async)
> > > > +			op->flags |= XE_VMA_OP_ASYNC;
> > > >    		switch (op->base.op) {
> > > >    		case DRM_GPUVA_OP_MAP:
> > > > @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> > > >    				 op->syncs, op->num_syncs,
> > > >    				 op->map.immediate || !xe_vm_in_fault_mode(vm),
> > > >    				 op->flags & XE_VMA_OP_FIRST,
> > > > -				 op->flags & XE_VMA_OP_LAST);
> > > > +				 op->flags & XE_VMA_OP_LAST,
> > > > +				 op->flags & XE_VMA_OP_ASYNC);
> > > >    		break;
> > > >    	case DRM_GPUVA_OP_REMAP:
> > > >    	{
> > > > @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> > > >    					   op->num_syncs,
> > > >    					   op->flags & XE_VMA_OP_FIRST,
> > > >    					   op->flags & XE_VMA_OP_LAST &&
> > > > -					   !prev && !next);
> > > > +					   !prev && !next,
> > > > +					   op->flags & XE_VMA_OP_ASYNC);
> > > >    			if (err)
> > > >    				break;
> > > >    			op->remap.unmap_done = true;
> > > > @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> > > >    			err = xe_vm_bind(vm, op->remap.prev, op->q,
> > > >    					 xe_vma_bo(op->remap.prev), op->syncs,
> > > >    					 op->num_syncs, true, false,
> > > > -					 op->flags & XE_VMA_OP_LAST && !next);
> > > > +					 op->flags & XE_VMA_OP_LAST && !next,
> > > > +					 op->flags & XE_VMA_OP_ASYNC);
> > > >    			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
> > > >    			if (err)
> > > >    				break;
> > > > @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> > > >    					 xe_vma_bo(op->remap.next),
> > > >    					 op->syncs, op->num_syncs,
> > > >    					 true, false,
> > > > -					 op->flags & XE_VMA_OP_LAST);
> > > > +					 op->flags & XE_VMA_OP_LAST,
> > > > +					 op->flags & XE_VMA_OP_ASYNC);
> > > >    			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
> > > >    			if (err)
> > > >    				break;
> > > > @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> > > >    	case DRM_GPUVA_OP_UNMAP:
> > > >    		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
> > > >    				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
> > > > -				   op->flags & XE_VMA_OP_LAST);
> > > > +				   op->flags & XE_VMA_OP_LAST,
> > > > +				   op->flags & XE_VMA_OP_ASYNC);
> > > >    		break;
> > > >    	case DRM_GPUVA_OP_PREFETCH:
> > > >    		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
> > > >    				     op->syncs, op->num_syncs,
> > > >    				     op->flags & XE_VMA_OP_FIRST,
> > > > -				     op->flags & XE_VMA_OP_LAST);
> > > > +				     op->flags & XE_VMA_OP_LAST,
> > > > +				     op->flags & XE_VMA_OP_ASYNC);
> > > >    		break;
> > > >    	default:
> > > >    		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
> > > > @@ -2808,16 +2805,16 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
> > > >    #ifdef TEST_VM_ASYNC_OPS_ERROR
> > > >    #define SUPPORTED_FLAGS	\
> > > > -	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
> > > > -	 DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
> > > > -	 DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> > > > +	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
> > > > +	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> > > >    #else
> > > >    #define SUPPORTED_FLAGS	\
> > > > -	(DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
> > > > +	(DRM_XE_VM_BIND_FLAG_READONLY | \
> > > >    	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
> > > >    	 0xffff)
> > > >    #endif
> > > >    #define XE_64K_PAGE_MASK 0xffffull
> > > > +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> > > >    #define MAX_BINDS	512	/* FIXME: Picking random upper limit */
> > > > @@ -2829,7 +2826,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> > > >    	int err;
> > > >    	int i;
> > > > -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > > > +	if (XE_IOCTL_DBG(xe, args->pad) ||
> > > >    	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> > > >    		return -EINVAL;
> > > > @@ -2857,6 +2854,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> > > >    		*bind_ops = &args->bind;
> > > >    	}
> > > > +	*async = !(args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
> > > > +
> > > > +	if (XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > > > +	    XE_IOCTL_DBG(xe, !*async && args->syncs.num_syncs)) {
> > > > +		err = -EINVAL;
> > > > +		goto free_bind_ops;
> > > > +	}
> > > > +
> > > >    	for (i = 0; i < args->num_binds; ++i) {
> > > >    		u64 range = (*bind_ops)[i].range;
> > > >    		u64 addr = (*bind_ops)[i].addr;
> > > > @@ -2887,18 +2892,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> > > >    			goto free_bind_ops;
> > > >    		}
> > > > -		if (i == 0) {
> > > > -			*async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
> > > > -			if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
> > > > -				err = -EINVAL;
> > > > -				goto free_bind_ops;
> > > > -			}
> > > > -		} else if (XE_IOCTL_DBG(xe, *async !=
> > > > -					!!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
> > > > -			err = -EINVAL;
> > > > -			goto free_bind_ops;
> > > > -		}
> > > > -
> > > >    		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
> > > >    		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
> > > >    		    XE_IOCTL_DBG(xe, obj && is_null) ||
> > > > @@ -2951,7 +2944,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
> > > >    static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> > > >    				       struct xe_exec_queue *q,
> > > >    				       struct xe_sync_entry *syncs,
> > > > -				       int num_syncs)
> > > > +				       int num_syncs, bool async)
> > > >    {
> > > >    	struct dma_fence *fence;
> > > >    	int i, err = 0;
> > > > @@ -2967,7 +2960,7 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> > > >    	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
> > > >    				     fence);
> > > > -	if (xe_vm_sync_mode(vm, q)) {
> > > > +	if (!async) {
> > > >    		long timeout = dma_fence_wait(fence, true);
> > > >    		if (timeout < 0)
> > > > @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    	if (err)
> > > >    		return err;
> > > > -	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > > > +	if (XE_IOCTL_DBG(xe, args->pad) ||
> > > >    	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> > > >    		return -EINVAL;
> > > > @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    			err = -EINVAL;
> > > >    			goto put_exec_queue;
> > > >    		}
> > > > -
> > > > -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> > > > -				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
> > > > -			err = -EINVAL;
> > > > -			goto put_exec_queue;
> > > > -		}
> > > >    	}
> > > >    	vm = xe_vm_lookup(xef, args->vm_id);
> > > > @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    		goto put_exec_queue;
> > > >    	}
> > > > -	if (!args->exec_queue_id) {
> > > > -		if (XE_IOCTL_DBG(xe, args->num_binds && async !=
> > > > -				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
> > > > -			err = -EINVAL;
> > > > -			goto put_vm;
> > > > -		}
> > > > -	}
> > > > -
> > > >    	err = down_write_killable(&vm->lock);
> > > >    	if (err)
> > > >    		goto put_vm;
> > > > @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    		}
> > > >    	}
> > > > -	if (args->num_syncs) {
> > > > -		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
> > > > +	if (args->syncs.num_syncs) {
> > > > +		syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs), GFP_KERNEL);
> > > >    		if (!syncs) {
> > > >    			err = -ENOMEM;
> > > >    			goto put_obj;
> > > >    		}
> > > >    	}
> > > > -	syncs_user = u64_to_user_ptr(args->syncs);
> > > > -	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
> > > > +	syncs_user = u64_to_user_ptr(args->syncs.syncs);
> > > > +	for (num_syncs = 0; num_syncs < args->syncs.num_syncs; num_syncs++) {
> > > >    		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
> > > >    					  &syncs_user[num_syncs],
> > > >    					  (xe_vm_in_lr_mode(vm) ?
> > > > @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > >    	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
> > > >    free_syncs:
> > > >    	if (err == -ENODATA)
> > > > -		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
> > > > +		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs,
> > > > +						  async);
> > > >    	while (num_syncs--)
> > > >    		xe_sync_entry_cleanup(&syncs[num_syncs]);
> > > > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> > > > index 23abdfd8622f..ce8b9bde7e9c 100644
> > > > --- a/drivers/gpu/drm/xe/xe_vm_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> > > > @@ -167,13 +167,12 @@ struct xe_vm {
> > > >    	 */
> > > >    #define XE_VM_FLAG_64K			BIT(0)
> > > >    #define XE_VM_FLAG_LR_MODE		BIT(1)
> > > > -#define XE_VM_FLAG_ASYNC_DEFAULT	BIT(2)
> > > > -#define XE_VM_FLAG_MIGRATION		BIT(3)
> > > > -#define XE_VM_FLAG_SCRATCH_PAGE		BIT(4)
> > > > -#define XE_VM_FLAG_FAULT_MODE		BIT(5)
> > > > -#define XE_VM_FLAG_BANNED		BIT(6)
> > > > -#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(8, 7), flags)
> > > > -#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(8, 7), (tile)->id)
> > > > +#define XE_VM_FLAG_MIGRATION		BIT(2)
> > > > +#define XE_VM_FLAG_SCRATCH_PAGE		BIT(3)
> > > > +#define XE_VM_FLAG_FAULT_MODE		BIT(4)
> > > > +#define XE_VM_FLAG_BANNED		BIT(5)
> > > > +#define XE_VM_FLAG_TILE_ID(flags)	FIELD_GET(GENMASK(7, 6), flags)
> > > > +#define XE_VM_FLAG_SET_TILE_ID(tile)	FIELD_PREP(GENMASK(7, 6), (tile)->id)
> > > >    	unsigned long flags;
> > > >    	/** @composite_fence_ctx: context composite fence */
> > > > @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
> > > >    	XE_VMA_OP_PREV_COMMITTED	= BIT(3),
> > > >    	/** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */
> > > >    	XE_VMA_OP_NEXT_COMMITTED	= BIT(4),
> > > > +	/** @XE_VMA_OP_ASYNC: operation is async */
> > > > +	XE_VMA_OP_ASYNC			= BIT(5),
> > > >    };
> > > >    /** struct xe_vma_op - VMA operation */
> > > > diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> > > > index eb03a49c17a1..fd8172fe2d9a 100644
> > > > --- a/include/uapi/drm/xe_drm.h
> > > > +++ b/include/uapi/drm/xe_drm.h
> > > > @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
> > > >    	 * Kernel only classes (not actual hardware engine class). Used for
> > > >    	 * creating ordered queues of VM bind operations.
> > > >    	 */
> > > > -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC	5
> > > > -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC	6
> > > > +#define DRM_XE_ENGINE_CLASS_VM_BIND		5
> > > >    	__u16 engine_class;
> > > >    	__u16 engine_instance;
> > > > @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
> > > >    	 * still enable recoverable pagefaults if supported by the device.
> > > >    	 */
> > > >    #define DRM_XE_VM_CREATE_FLAG_LR_MODE	        (1 << 1)
> > > > -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT	(1 << 2)
> > > >    	/*
> > > >    	 * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
> > > >    	 * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to be allocated
> > > > @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
> > > >    	 * The xe driver internally uses recoverable pagefaults to implement
> > > >    	 * this.
> > > >    	 */
> > > > -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 3)
> > > > +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE	(1 << 2)
> > > >    	/** @flags: Flags */
> > > >    	__u32 flags;
> > > > @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
> > > >    	__u32 op;
> > > >    #define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
> > > > -#define DRM_XE_VM_BIND_FLAG_ASYNC	(1 << 1)
> > > >    	/*
> > > >    	 * Valid on a faulting VM only, do the MAP operation immediately rather
> > > >    	 * than deferring the MAP to the page fault handler.
> > > >    	 */
> > > > -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 2)
> > > > +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
> > > >    	/*
> > > >    	 * When the NULL flag is set, the page tables are setup with a special
> > > >    	 * bit which indicates writes are dropped and all reads return zero.  In
> > > > @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
> > > >    	 * operations, the BO handle MBZ, and the BO offset MBZ. This flag is
> > > >    	 * intended to implement VK sparse bindings.
> > > >    	 */
> > > > -#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 3)
> > > > +#define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
> > > >    	/** @flags: Bind flags */
> > > >    	__u32 flags;
> > > > @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
> > > >    	__u64 reserved[3];
> > > >    };
> > > > +/**
> > > > + * struct drm_xe_syncs - In / out syncs for IOCTLs.
> > > > + */
> > > > +struct drm_xe_syncs {
> > > > +	/** @num_syncs: amount of syncs to wait on */
> > > > +	__u32 num_syncs;
> > > > +
> > > > +	/*
> > > > +	 * Block in IOCTL until operation complete, num_syncs MBZ if set.
> > > > +	 */
> > > > +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
> > > > +	/** @flags: Sync flags */
> > > > +	__u32 flags;
> > > > +
> > > > +	/** @syncs: pointer to struct drm_xe_sync array */
> > > > +	__u64 syncs;
> > > > +
> > > > +	/** @reserved: Reserved */
> > > > +	__u64 reserved[2];
> > > > +};
> > > > +
> > > >    struct drm_xe_vm_bind {
> > > >    	/** @extensions: Pointer to the first extension struct, if any */
> > > >    	__u64 extensions;
> > > > @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
> > > >    		__u64 vector_of_binds;
> > > >    	};
> > > > -	/** @pad: MBZ */
> > > > -	__u32 pad2;
> > > > -
> > > > -	/** @num_syncs: amount of syncs to wait on */
> > > > -	__u32 num_syncs;
> > > > -
> > > > -	/** @syncs: pointer to struct drm_xe_sync array */
> > > > -	__u64 syncs;
> > > > +	/** @syncs: syncs for bind */
> > > > +	struct drm_xe_syncs syncs;
> > > >    	/** @reserved: Reserved */
> > > >    	__u64 reserved[2];
> > > > @@ -974,14 +986,14 @@ struct drm_xe_exec {
> > > >    	/** @extensions: Pointer to the first extension struct, if any */
> > > >    	__u64 extensions;
> > > > +	/** @pad: MBZ */
> > > > +	__u32 pad;
> > > > +
> > > >    	/** @exec_queue_id: Exec queue ID for the batch buffer */
> > > >    	__u32 exec_queue_id;
> > > > -	/** @num_syncs: Amount of struct drm_xe_sync in array. */
> > > > -	__u32 num_syncs;
> > > > -
> > > > -	/** @syncs: Pointer to struct drm_xe_sync array. */
> > > > -	__u64 syncs;
> > > > +	/** @syncs: syncs for exec */
> > > > +	struct drm_xe_syncs syncs;
> > > >    	/**
> > > >    	 * @address: address of batch buffer if num_batch_buffer == 1 or an
> > > > @@ -995,8 +1007,8 @@ struct drm_xe_exec {
> > > >    	 */
> > > >    	__u16 num_batch_buffer;
> > > > -	/** @pad: MBZ */
> > > > -	__u16 pad[3];
> > > > +	/** @pad2: MBZ */
> > > > +	__u16 pad2[3];
> > > >    	/** @reserved: Reserved */
> > > >    	__u64 reserved[2];

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-11 16:49         ` Matthew Brost
@ 2023-12-11 18:11           ` Thomas Hellström
  2023-12-11 21:11             ` Matthew Brost
  0 siblings, 1 reply; 22+ messages in thread
From: Thomas Hellström @ 2023-12-11 18:11 UTC (permalink / raw)
  To: Matthew Brost; +Cc: Francois Dugast, intel-xe, Rodrigo Vivi

On Mon, 2023-12-11 at 16:49 +0000, Matthew Brost wrote:
> On Mon, Dec 11, 2023 at 04:43:06PM +0100, Thomas Hellström wrote:
> > 
> > On 12/8/23 10:45, Matthew Brost wrote:
> > > On Fri, Dec 08, 2023 at 04:00:37PM +0100, Thomas Hellström wrote:
> > > > On 12/7/23 06:57, Matthew Brost wrote:
> > > > > Remove concept of async vs sync VM bind queues, rather make
> > > > > async vs
> > > > > sync a per IOCTL choice. Since this is per IOCTL, it makes
> > > > > sense to have
> > > > > a singular flag IOCTL rather than per VM bind op flag too.
> > > > > Add
> > > > > DRM_XE_SYNCS_FLAG_WAIT_FOR_OP which is an input sync flag to
> > > > > support
> > > > > this. Support this new flag for both the VM bind IOCTL and
> > > > > the exec
> > > > > IOCTL to match behavior.
> > > > > 
> > > > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > Cc: Francois Dugast <francois.dugast@intel.com>
> > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > ---
> > > > >    drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++---
> > > > > -
> > > > >    drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
> > > > >    drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
> > > > >    drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-
> > > > > ------------
> > > > >    drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
> > > > >    include/uapi/drm/xe_drm.h                |  56 +++++++----
> > > > > -
> > > > >    6 files changed, 129 insertions(+), 119 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/xe/xe_exec.c
> > > > > b/drivers/gpu/drm/xe/xe_exec.c
> > > > > index 92b0da6580e8..c62cabfaa112 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_exec.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > > > > @@ -130,12 +130,15 @@ static int xe_exec_begin(struct
> > > > > drm_exec *exec, struct xe_vm *vm)
> > > > >         return err;
> > > > >    }
> > > > > +#define ALL_DRM_XE_SYNCS_FLAGS
> > > > > (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> > > > > +
> > > > >    int xe_exec_ioctl(struct drm_device *dev, void *data,
> > > > > struct drm_file *file)
> > > > >    {
> > > > >         struct xe_device *xe = to_xe_device(dev);
> > > > >         struct xe_file *xef = to_xe_file(file);
> > > > >         struct drm_xe_exec *args = data;
> > > > > -       struct drm_xe_sync __user *syncs_user =
> > > > > u64_to_user_ptr(args->syncs);
> > > > > +       struct drm_xe_sync __user *syncs_user =
> > > > > +               u64_to_user_ptr(args->syncs.syncs);
> > > > >         u64 __user *addresses_user = u64_to_user_ptr(args-
> > > > > >address);
> > > > >         struct xe_exec_queue *q;
> > > > >         struct xe_sync_entry *syncs = NULL;
> > > > > @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device
> > > > > *dev, void *data, struct drm_file *file)
> > > > >         struct drm_exec exec;
> > > > >         u32 i, num_syncs = 0;
> > > > >         struct xe_sched_job *job;
> > > > > -       struct dma_fence *rebind_fence;
> > > > > +       struct dma_fence *rebind_fence, *job_fence;
> > > > >         struct xe_vm *vm;
> > > > > -       bool write_locked;
> > > > > +       bool write_locked, skip_job_put = false;
> > > > > +       bool wait = args->syncs.flags &
> > > > > DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
> > > > >         ktime_t end = 0;
> > > > >         int err = 0;
> > > > >         if (XE_IOCTL_DBG(xe, args->extensions) ||
> > > > > -           XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] ||
> > > > > args->pad[2]) ||
> > > > > -           XE_IOCTL_DBG(xe, args->reserved[0] || args-
> > > > > >reserved[1]))
> > > > > +           XE_IOCTL_DBG(xe, args->pad || args->pad2[0] ||
> > > > > args->pad2[1] || args->pad2[2]) ||
> > > > > +           XE_IOCTL_DBG(xe, args->reserved[0] || args-
> > > > > >reserved[1]) ||
> > > > > +           XE_IOCTL_DBG(xe, args->syncs.flags &
> > > > > ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > > > > +           XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
> > > > >                 return -EINVAL;
> > > > >         q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> > > > > @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev,
> > > > > void *data, struct drm_file *file)
> > > > >                 goto err_exec_queue;
> > > > >         }
> > > > > -       if (args->num_syncs) {
> > > > > -               syncs = kcalloc(args->num_syncs,
> > > > > sizeof(*syncs), GFP_KERNEL);
> > > > > +       if (args->syncs.num_syncs) {
> > > > > +               syncs = kcalloc(args->syncs.num_syncs,
> > > > > sizeof(*syncs),
> > > > > +                               GFP_KERNEL);
> > > > >                 if (!syncs) {
> > > > >                         err = -ENOMEM;
> > > > >                         goto err_exec_queue;
> > > > > @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev,
> > > > > void *data, struct drm_file *file)
> > > > >         vm = q->vm;
> > > > > -       for (i = 0; i < args->num_syncs; i++) {
> > > > > +       for (i = 0; i < args->syncs.num_syncs; i++) {
> > > > >                 err = xe_sync_entry_parse(xe, xef,
> > > > > &syncs[num_syncs++],
> > > > >                                           &syncs_user[i],
> > > > > SYNC_PARSE_FLAG_EXEC |
> > > > >                                          
> > > > > (xe_vm_in_lr_mode(vm) ?
> > > > > @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device
> > > > > *dev, void *data, struct drm_file *file)
> > > > >                                 err = PTR_ERR(fence);
> > > > >                                 goto err_exec;
> > > > >                         }
> > > > > +
> > > > >                         for (i = 0; i < num_syncs; i++)
> > > > >                                 xe_sync_entry_signal(&syncs[i
> > > > > ], NULL, fence);
> > > > > +
> > > > >                         xe_exec_queue_last_fence_set(q, vm,
> > > > > fence);
> > > > > +                       if (wait) {
> > > > > +                               long timeout =
> > > > > dma_fence_wait(fence, true);
> > > > > +
> > > > > +                               if (timeout < 0)
> > > > > +                                       err = -EINTR;
> > > > > +                       }
> > > > Here it looks like we will rerun the same IOCTL if we return
> > > > -EINTR. The expected user-space action on -EINTR is to just
> > > > restart the IOCTL without any argument changes. The solution is
> > > > to add an ioctl argument cookie (or to skip sync vm binds and
> > > > have the user just use the 0 batch buffer / 0 bind calls, or
> > > > wait for an out-fence).
> > > > cookie solution
> > > > then IMO we should keep the -ERESTARTSYS returned from
> > > > dma_fence_wait()
> > > > since it's converted to -EINTR on return-to-user-space, and the
> > > > kernel
> > > > restarts the IOCTL automatically if there was no requested-for-
> > > > delivery
> > > > signal pending.
> > > > 
> > > > I think the simplest solution at this point is to skip the sync
> > > > behaviour,
> > > > in particular if we enable the 0 batch / bind possibility.
> > > > 
> > > > If we still want to provide it, we could add a cookie address
> > > > as an extension to the ioctl and activate sync if present?
> > > > (Just throwing out ideas here).
> > > > 
> > > Hmm, forgot about this. A cookie is fairly easy, what about
> > > something like this:
> > > 
> > >   807 /**
> > >   808  * struct drm_xe_syncs - In / out syncs for IOCTLs.
> > >   809  */
> > >   810 struct drm_xe_syncs {
> > >   811         /** @num_syncs: amount of syncs to wait on */
> > >   812         __u32 num_syncs;
> > >   813
> > >   814         /*
> > >   815          * Block in IOCTL until operation complete,
> > > num_syncs MBZ if set.
> > >   816          */
> > >   817 #define DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP (1 << 0)
> > >   818         /** @in_flags: Input Sync flags */
> > >   819         __u16 in_flags;
> > >   820
> > >   821         /*
> > >   822          * IOCTL operation has started (no need for user to
> > > resubmit on
> > >   823          * -ERESTARTSYS)
> > >   824          */
> > >   825 #define DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED (1 << 0)
> > >   826         /** @out_flags: Output Sync flags */
> > >   827         __u16 out_flags;
> > >   828
> > >   829         /** @syncs: pointer to struct drm_xe_sync array */
> > >   830         __u64 syncs;
> > >   831
> > >   832         /** @reserved: Reserved */
> > >   833         __u64 reserved[2];
> > >   834 };
> > > 
> > > DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED gets set in the exec / bind
> > > IOCTL after the job is committed or, in the case of zero ops, after
> > > the last fence is updated on the queue. Note that for binds we
> > > don't yet do 1 job per IOCTL, but we will once some version of [1]
> > > lands.
> > > 
> > > After DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED is set we return
> > > -ERESTARTSYS if the wait is interrupted, and -EINTR if
> > > DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED is still unset (interrupted
> > > before the job is committed).
> > > 
> > > I'd rather go with this patch, as we have to change the uAPI here
> > > regardless, so we might as well make this complete.
> > > 
> > > Matt
> > > 
> > > [1] https://patchwork.freedesktop.org/series/125608/
> > 
> > Yeah, as we discussed in the meeting, that means making the ioctl
> > RW instead of W, with some copying overhead.
> > 
> > I also think we should leave the EXEC ioctl out of this, meaning
> > just having
> > a single field in the VM_BIND ioctl. Basically the reason is that
> > waiting
> > like this after submission is a bit weird and does not align well
> > with how
> > -EINTR is typically used.
> > 
> 
> I kinda like uniform behavior between exec and binds, with the
> semantics defined in a common sync structure.

Even so, I strongly think we should *not* in any way expose this for
exec. If needed, the user can just wait for an out-fence; then we don't
need to implement code for this that will probably never get used, with
an implementation that very few will understand.

Furthermore, per the ASYNC VM_BIND doc, the sync VM_BIND ioctl allows
neither in-fences nor out-fences, so grouping like this becomes a bit
overkill.
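
For reference, the out-fence route would look roughly like this from
userspace. A minimal sketch only: the drm_xe_sync flag / field names
are assumed from the uAPI revision in this series (including the
embedded struct drm_xe_syncs), and includes plus error handling are
omitted.

        /* Create a syncobj and attach it to the exec as an out-fence,
         * then block on it -- the same observable effect as a sync
         * exec, with no extra kernel-side wait logic.
         */
        uint32_t syncobj;
        drmSyncobjCreate(fd, 0, &syncobj);

        struct drm_xe_sync out_sync = {
                .flags = DRM_XE_SYNC_SYNCOBJ | DRM_XE_SYNC_SIGNAL,
                .handle = syncobj,
        };

        struct drm_xe_exec exec = {
                .exec_queue_id = queue_id,
                .address = batch_addr,
                .num_batch_buffer = 1,
                .syncs.num_syncs = 1,
                .syncs.syncs = (uintptr_t)&out_sync,
        };
        ioctl(fd, DRM_IOCTL_XE_EXEC, &exec);

        /* Wait for the job to complete. */
        drmSyncobjWait(fd, &syncobj, 1, INT64_MAX, 0, NULL);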

> 
> > So either a pointer to a cookie in the ioctl,
> > 
> 
> What about:
> 
> 119 > >   807 /**
> 120 > >   808  * struct drm_xe_syncs - In / out syncs for IOCTLs.
> 121 > >   809  */
> 122 > >   810 struct drm_xe_syncs {
> 123 > >   811         /** @num_syncs: amount of syncs to wait on */
> 124 > >   812         __u32 num_syncs;
> 125 > >   813
> 126 > >   814         /*
> 127 > >   815          * Block in IOCTL until operation complete, num_syncs MBZ if set.
> 128 > >   816          */
> 129 > >   817 #define DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP (1 << 0)
> 130 > >   818         /** @in_flags: Input Sync flags */
> 131 > >   819         __u32 in_flags;
> 132 > >   820
> 138 > >   826         /** @cookie: userptr cookie written back with non-zero value once operation committed, only valid when IOCTL returns -EINTR */
> 139 > >   827         __u64 cookie;
> 140 > >   828
> 141 > >   829         /** @syncs: pointer to struct drm_xe_sync array */
> 142 > >   830         __u64 syncs;
> 143 > >   831
> 144 > >   832         /** @reserved: Reserved */
> 145 > >   833         __u64 reserved[2];
> 146 > >   834 };
> 
> Also, if cookie is 0, do we wait uninterruptibly once the op is
> committed?

I'm afraid I don't follow. The *interruptible* wait after commit is
what triggers the need for a cookie in the first place? Also here,
@cookie is still read-only for the kernel since the struct drm_xe_syncs
is embedded in the ioctl. And I think any cookie should be opaque to
the user, other than that it must be 0 if not calling after an
-ERESTART.
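
To make the alternative concrete, the cookie variant would need a
restart loop along these lines in userspace. This is hypothetical: it
assumes the ioctl is made RW (or the cookie sits behind a user pointer)
so the kernel can actually write it back, and the flag / field names
are taken from the proposal above, not from any final uAPI.

        /* Hypothetical sync-bind restart loop. On interruption the
         * kernel writes a non-zero cookie iff the operation was already
         * committed; resubmitting the args unchanged then skips straight
         * to the wait instead of redoing the bind.
         */
        struct drm_xe_vm_bind bind = {
                .vm_id = vm_id,
                .num_binds = 1,
                /* bind op setup elided */
        };
        bind.syncs.in_flags = DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP;
        bind.syncs.cookie = 0;  /* MBZ on the first call */

        int ret;
        do {
                ret = ioctl(fd, DRM_IOCTL_XE_VM_BIND, &bind);
        } while (ret == -1 && errno == EINTR);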

> 
> > or perhaps dig up again the idea we had of mostly waiting before
> > the
> > submission:
> > 
> > 1) Pull out the last_op fence for the queue from under the relevant
> > lock.
> > 2) Wait for all dependencies without any locks.
> > 3) Lock, and (optionally) if the last_op fence changed, wait for
> > it.
> > 4) Submit
> > 5) Wait for completion uninterruptible.
> > 
> 
> We can always change the internal implementation to something like
> this
> after [1]. That series makes refactors like this quite a bit easier.

Well, the idea of the above 1) - 5) was that we wouldn't need any
cookie at all, since the wait in 5) would be short, and we therefore
could get away with implementing it uninterruptibly. If that turned out
to be bad, we could add the cookie as an extension. The initial
implementation can even use uninterruptible waits for simplicity.
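
As pseudocode, the 1) - 5) flow would be something like the sketch
below. The helper names are illustrative only, not actual xe functions;
the point is that only the short post-submit wait in 5) needs to be
uninterruptible.

        /* Pseudocode for the wait-mostly-before-submit scheme. */
        last = queue_last_fence_get(q, vm);     /* 1) under the VM lock */
        unlock(vm);

        wait_for_in_syncs_interruptible(op);    /* 2) no locks held */
        dma_fence_wait(last, true);             /*    interruptible */

        lock(vm);
        if (queue_last_fence(q, vm) != last)    /* 3) recheck under lock */
                dma_fence_wait(queue_last_fence(q, vm), true);

        fence = submit(op);                     /* 4) */
        dma_fence_wait(fence, false);           /* 5) short, uninterruptible */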

To summarize:

* I strongly think we should not support sync exec calls.
* No in-syncs or out-syncs if SYNC.
* A flag to trigger sync binds. Syncs could be in a separate struct,
but that's not really needed if we don't support sync execs.
* If we go for the interruptible wait, we need a writable cookie that
is not embedded in the main struct.


/Thomas



> 
> Matt
> 
> [1] https://patchwork.freedesktop.org/series/125608/ 
> 
> > I actually like this last one best, but we'd recommend UMDs use
> > out-fences whenever possible.
> > 
> > Thoughts?
> > 
> > > 
> > > > >                         dma_fence_put(fence);
> > > > >                 }
> > > > > @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device
> > > > > *dev, void *data, struct drm_file *file)
> > > > >          * the job and let the DRM scheduler / backend clean
> > > > > up the job.
> > > > >          */
> > > > >         xe_sched_job_arm(job);
> > > > > +       job_fence = &job->drm.s_fence->finished;
> > > > > +       if (wait)
> > > > > +               dma_fence_get(job_fence);
> > > > >         if (!xe_vm_in_lr_mode(vm)) {
> > > > >                 /* Block userptr invalidations / BO eviction
> > > > > */
> > > > > -               dma_resv_add_fence(&vm->resv,
> > > > > -                                  &job->drm.s_fence-
> > > > > >finished,
> > > > > +               dma_resv_add_fence(&vm->resv, job_fence,
> > > > >                                    DMA_RESV_USAGE_BOOKKEEP);
> > > > >                 /*
> > > > >                  * Make implicit sync work across drivers,
> > > > > assuming all external
> > > > >                  * BOs are written as we don't pass in a read
> > > > > / write list.
> > > > >                  */
> > > > > -               xe_vm_fence_all_extobjs(vm, &job-
> > > > > >drm.s_fence->finished,
> > > > > -
> > > > >                                        DMA_RESV_USAGE_WRITE);
> > > > > +               xe_vm_fence_all_extobjs(vm, job_fence,
> > > > > DMA_RESV_USAGE_WRITE);
> > > > >         }
> > > > >         for (i = 0; i < num_syncs; i++)
> > > > > -               xe_sync_entry_signal(&syncs[i], job,
> > > > > -                                    &job->drm.s_fence-
> > > > > >finished);
> > > > > +               xe_sync_entry_signal(&syncs[i], job,
> > > > > job_fence);
> > > > >         if (xe_exec_queue_is_lr(q))
> > > > >                 q->ring_ops->emit_job(job);
> > > > >         if (!xe_vm_in_lr_mode(vm))
> > > > > -               xe_exec_queue_last_fence_set(q, vm, &job-
> > > > > >drm.s_fence->finished);
> > > > > +               xe_exec_queue_last_fence_set(q, vm,
> > > > > job_fence);
> > > > >         xe_sched_job_push(job);
> > > > >         xe_vm_reactivate_rebind(vm);
> > > > > -       if (!err && !xe_vm_in_lr_mode(vm)) {
> > > > > +       if (!xe_vm_in_lr_mode(vm)) {
> > > > >                 spin_lock(&xe->ttm.lru_lock);
> > > > >                 ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
> > > > >                 spin_unlock(&xe->ttm.lru_lock);
> > > > >         }
> > > > > +       skip_job_put = true;
> > > > > +       if (wait) {
> > > > > +               long timeout = dma_fence_wait(job_fence,
> > > > > true);
> > > > > +
> > > > > +               dma_fence_put(job_fence);
> > > > > +               if (timeout < 0)
> > > > > +                       err = -EINTR;
> > > > > +       }
> > > > > +
> > > > >    err_repin:
> > > > >         if (!xe_vm_in_lr_mode(vm))
> > > > >                 up_read(&vm->userptr.notifier_lock);
> > > > >    err_put_job:
> > > > > -       if (err)
> > > > > +       if (err && !skip_job_put)
> > > > >                 xe_sched_job_put(job);
> > > > >    err_exec:
> > > > >         drm_exec_fini(&exec);
> > > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > index 3911d14522ee..98776d02d634 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct
> > > > > drm_device *dev, void *data,
> > > > >         if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe-
> > > > > >info.gt_count))
> > > > >                 return -EINVAL;
> > > > > -       if (eci[0].engine_class >=
> > > > > DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
> > > > > -               bool sync = eci[0].engine_class ==
> > > > > -                       DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
> > > > > -
> > > > > +       if (eci[0].engine_class ==
> > > > > DRM_XE_ENGINE_CLASS_VM_BIND) {
> > > > >                 for_each_gt(gt, xe, id) {
> > > > >                         struct xe_exec_queue *new;
> > > > > @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct
> > > > > drm_device *dev, void *data,
> > > > >                                                    args-
> > > > > >width, hwe,
> > > > >                                                   
> > > > > EXEC_QUEUE_FLAG_PERSISTENT |
> > > > >                                                   
> > > > > EXEC_QUEUE_FLAG_VM |
> > > > > -                                                  (sync ? 0
> > > > > :
> > > > > -                                                  
> > > > > EXEC_QUEUE_FLAG_VM_ASYNC) |
> > > > >                                                    (id ?
> > > > >                                                    
> > > > > EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
> > > > >                                                     0));
> > > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > index 52f0927d0d9b..c78f6e8b41c4 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > @@ -74,8 +74,6 @@ struct xe_exec_queue {
> > > > >    #define EXEC_QUEUE_FLAG_VM                   BIT(4)
> > > > >    /* child of VM queue for multi-tile VM jobs */
> > > > >    #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD    BIT(5)
> > > > > -/* VM jobs for this queue are asynchronous */
> > > > > -#define EXEC_QUEUE_FLAG_VM_ASYNC               BIT(6)
> > > > >         /**
> > > > >          * @flags: flags for this exec queue, should
> > > > > statically setup aside from ban
> > > > > diff --git a/drivers/gpu/drm/xe/xe_vm.c
> > > > > b/drivers/gpu/drm/xe/xe_vm.c
> > > > > index cf2eb44a71db..4b0c976c003a 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_vm.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > > > > @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct
> > > > > xe_device *xe, u32 flags)
> > > > >                         struct xe_gt *gt = tile->primary_gt;
> > > > >                         struct xe_vm *migrate_vm;
> > > > >                         struct xe_exec_queue *q;
> > > > > -                       u32 create_flags = EXEC_QUEUE_FLAG_VM
> > > > > |
> > > > > -                               ((flags &
> > > > > XE_VM_FLAG_ASYNC_DEFAULT) ?
> > > > > -                               EXEC_QUEUE_FLAG_VM_ASYNC :
> > > > > 0);
> > > > > +                       u32 create_flags =
> > > > > EXEC_QUEUE_FLAG_VM;
> > > > >                         if (!vm->pt_root[id])
> > > > >                                 continue;
> > > > > @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma,
> > > > > struct xe_exec_queue *q,
> > > > >         return ERR_PTR(err);
> > > > >    }
> > > > > -static bool xe_vm_sync_mode(struct xe_vm *vm, struct
> > > > > xe_exec_queue *q)
> > > > > -{
> > > > > -       return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
> > > > > -               !(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
> > > > > -}
> > > > > -
> > > > >    static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma
> > > > > *vma,
> > > > >                         struct xe_exec_queue *q, struct
> > > > > xe_sync_entry *syncs,
> > > > >                         u32 num_syncs, bool immediate, bool
> > > > > first_op,
> > > > > -                       bool last_op)
> > > > > +                       bool last_op, bool async)
> > > > >    {
> > > > >         struct dma_fence *fence;
> > > > >         struct xe_exec_queue *wait_exec_queue =
> > > > > to_wait_exec_queue(vm, q);
> > > > > @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm
> > > > > *vm, struct xe_vma *vma,
> > > > >         if (last_op)
> > > > >                 xe_exec_queue_last_fence_set(wait_exec_queue,
> > > > > vm, fence);
> > > > > -       if (last_op && xe_vm_sync_mode(vm, q))
> > > > > +       if (last_op && !async)
> > > > >                 dma_fence_wait(fence, true);
> > > > >         dma_fence_put(fence);
> > > > > @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm
> > > > > *vm, struct xe_vma *vma,
> > > > >    static int xe_vm_bind(struct xe_vm *vm, struct xe_vma
> > > > > *vma, struct xe_exec_queue *q,
> > > > >                       struct xe_bo *bo, struct xe_sync_entry
> > > > > *syncs,
> > > > >                       u32 num_syncs, bool immediate, bool
> > > > > first_op,
> > > > > -                     bool last_op)
> > > > > +                     bool last_op, bool async)
> > > > >    {
> > > > >         int err;
> > > > > @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm
> > > > > *vm, struct xe_vma *vma, struct xe_exec_queue
> > > > >         }
> > > > >         return __xe_vm_bind(vm, vma, q, syncs, num_syncs,
> > > > > immediate, first_op,
> > > > > -                           last_op);
> > > > > +                           last_op, async);
> > > > >    }
> > > > >    static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma
> > > > > *vma,
> > > > >                         struct xe_exec_queue *q, struct
> > > > > xe_sync_entry *syncs,
> > > > > -                       u32 num_syncs, bool first_op, bool
> > > > > last_op)
> > > > > +                       u32 num_syncs, bool first_op, bool
> > > > > last_op, bool async)
> > > > >    {
> > > > >         struct dma_fence *fence;
> > > > >         struct xe_exec_queue *wait_exec_queue =
> > > > > to_wait_exec_queue(vm, q);
> > > > > @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm
> > > > > *vm, struct xe_vma *vma,
> > > > >         xe_vma_destroy(vma, fence);
> > > > >         if (last_op)
> > > > >                 xe_exec_queue_last_fence_set(wait_exec_queue,
> > > > > vm, fence);
> > > > > -       if (last_op && xe_vm_sync_mode(vm, q))
> > > > > +       if (last_op && !async)
> > > > >                 dma_fence_wait(fence, true);
> > > > It looks like we're dropping the error return code here.
> > > > 
> > > > 
> > > > >         dma_fence_put(fence);
> > > > > @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm
> > > > > *vm, struct xe_vma *vma,
> > > > >    #define ALL_DRM_XE_VM_CREATE_FLAGS
> > > > > (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
> > > > >                                    
> > > > > DRM_XE_VM_CREATE_FLAG_LR_MODE | \
> > > > > -                                  
> > > > > DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
> > > > >                                    
> > > > > DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> > > > >    int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> > > > > @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct
> > > > > drm_device *dev, void *data,
> > > > >                 flags |= XE_VM_FLAG_SCRATCH_PAGE;
> > > > >         if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
> > > > >                 flags |= XE_VM_FLAG_LR_MODE;
> > > > > -       if (args->flags &
> > > > > DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
> > > > > -               flags |= XE_VM_FLAG_ASYNC_DEFAULT;
> > > > >         if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> > > > >                 flags |= XE_VM_FLAG_FAULT_MODE;
> > > > > @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] =
> > > > > {
> > > > >    static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma
> > > > > *vma,
> > > > >                           struct xe_exec_queue *q, u32
> > > > > region,
> > > > >                           struct xe_sync_entry *syncs, u32
> > > > > num_syncs,
> > > > > -                         bool first_op, bool last_op)
> > > > > +                         bool first_op, bool last_op, bool
> > > > > async)
> > > > >    {
> > > > >         struct xe_exec_queue *wait_exec_queue =
> > > > > to_wait_exec_queue(vm, q);
> > > > >         int err;
> > > > > @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm
> > > > > *vm, struct xe_vma *vma,
> > > > >         if (vma->tile_mask != (vma->tile_present & ~vma-
> > > > > >usm.tile_invalidated)) {
> > > > >                 return xe_vm_bind(vm, vma, q, xe_vma_bo(vma),
> > > > > syncs, num_syncs,
> > > > > -                                 true, first_op, last_op);
> > > > > +                                 true, first_op, last_op,
> > > > > async);
> > > > >         } else {
> > > > >                 int i;
> > > > > @@ -2400,6 +2389,8 @@ static int
> > > > > vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct
> > > > > xe_exec_queue *q,
> > > > >                 }
> > > > >                 op->q = q;
> > > > > +               if (async)
> > > > > +                       op->flags |= XE_VMA_OP_ASYNC;
> > > > >                 switch (op->base.op) {
> > > > >                 case DRM_GPUVA_OP_MAP:
> > > > > @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec
> > > > > *exec, struct xe_vm *vm,
> > > > >                                  op->syncs, op->num_syncs,
> > > > >                                  op->map.immediate ||
> > > > > !xe_vm_in_fault_mode(vm),
> > > > >                                  op->flags & XE_VMA_OP_FIRST,
> > > > > -                                op->flags & XE_VMA_OP_LAST);
> > > > > +                                op->flags & XE_VMA_OP_LAST,
> > > > > +                                op->flags &
> > > > > XE_VMA_OP_ASYNC);
> > > > >                 break;
> > > > >         case DRM_GPUVA_OP_REMAP:
> > > > >         {
> > > > > @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec
> > > > > *exec, struct xe_vm *vm,
> > > > >                                            op->num_syncs,
> > > > >                                            op->flags &
> > > > > XE_VMA_OP_FIRST,
> > > > >                                            op->flags &
> > > > > XE_VMA_OP_LAST &&
> > > > > -                                          !prev && !next);
> > > > > +                                          !prev && !next,
> > > > > +                                          op->flags &
> > > > > XE_VMA_OP_ASYNC);
> > > > >                         if (err)
> > > > >                                 break;
> > > > >                         op->remap.unmap_done = true;
> > > > > @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec
> > > > > *exec, struct xe_vm *vm,
> > > > >                         err = xe_vm_bind(vm, op->remap.prev,
> > > > > op->q,
> > > > >                                          xe_vma_bo(op-
> > > > > >remap.prev), op->syncs,
> > > > >                                          op->num_syncs, true,
> > > > > false,
> > > > > -                                        op->flags &
> > > > > XE_VMA_OP_LAST && !next);
> > > > > +                                        op->flags &
> > > > > XE_VMA_OP_LAST && !next,
> > > > > +                                        op->flags &
> > > > > XE_VMA_OP_ASYNC);
> > > > >                         op->remap.prev->gpuva.flags &=
> > > > > ~XE_VMA_LAST_REBIND;
> > > > >                         if (err)
> > > > >                                 break;
> > > > > @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec
> > > > > *exec, struct xe_vm *vm,
> > > > >                                          xe_vma_bo(op-
> > > > > >remap.next),
> > > > >                                          op->syncs, op-
> > > > > >num_syncs,
> > > > >                                          true, false,
> > > > > -                                        op->flags &
> > > > > XE_VMA_OP_LAST);
> > > > > +                                        op->flags &
> > > > > XE_VMA_OP_LAST,
> > > > > +                                        op->flags &
> > > > > XE_VMA_OP_ASYNC);
> > > > >                         op->remap.next->gpuva.flags &=
> > > > > ~XE_VMA_LAST_REBIND;
> > > > >                         if (err)
> > > > >                                 break;
> > > > > @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec
> > > > > *exec, struct xe_vm *vm,
> > > > >         case DRM_GPUVA_OP_UNMAP:
> > > > >                 err = xe_vm_unbind(vm, vma, op->q, op->syncs,
> > > > >                                    op->num_syncs, op->flags &
> > > > > XE_VMA_OP_FIRST,
> > > > > -                                  op->flags &
> > > > > XE_VMA_OP_LAST);
> > > > > +                                  op->flags &
> > > > > XE_VMA_OP_LAST,
> > > > > +                                  op->flags &
> > > > > XE_VMA_OP_ASYNC);
> > > > >                 break;
> > > > >         case DRM_GPUVA_OP_PREFETCH:
> > > > >                 err = xe_vm_prefetch(vm, vma, op->q, op-
> > > > > >prefetch.region,
> > > > >                                      op->syncs, op-
> > > > > >num_syncs,
> > > > >                                      op->flags &
> > > > > XE_VMA_OP_FIRST,
> > > > > -                                    op->flags &
> > > > > XE_VMA_OP_LAST);
> > > > > +                                    op->flags &
> > > > > XE_VMA_OP_LAST,
> > > > > +                                    op->flags &
> > > > > XE_VMA_OP_ASYNC);
> > > > >                 break;
> > > > >         default:
> > > > >                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
> > > > > @@ -2808,16 +2805,16 @@ static int
> > > > > vm_bind_ioctl_ops_execute(struct xe_vm *vm,
> > > > >    #ifdef TEST_VM_ASYNC_OPS_ERROR
> > > > >    #define SUPPORTED_FLAGS      \
> > > > > -       (FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
> > > > > -        DRM_XE_VM_BIND_FLAG_READONLY |
> > > > > DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
> > > > > -        DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> > > > > +       (FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY
> > > > > | \
> > > > > +        DRM_XE_VM_BIND_FLAG_IMMEDIATE |
> > > > > DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> > > > >    #else
> > > > >    #define SUPPORTED_FLAGS      \
> > > > > -       (DRM_XE_VM_BIND_FLAG_ASYNC |
> > > > > DRM_XE_VM_BIND_FLAG_READONLY | \
> > > > > +       (DRM_XE_VM_BIND_FLAG_READONLY | \
> > > > >          DRM_XE_VM_BIND_FLAG_IMMEDIATE |
> > > > > DRM_XE_VM_BIND_FLAG_NULL | \
> > > > >          0xffff)
> > > > >    #endif
> > > > >    #define XE_64K_PAGE_MASK 0xffffull
> > > > > +#define ALL_DRM_XE_SYNCS_FLAGS
> > > > > (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> > > > >    #define MAX_BINDS    512     /* FIXME: Picking random
> > > > > upper limit */
> > > > > @@ -2829,7 +2826,7 @@ static int
> > > > > vm_bind_ioctl_check_args(struct xe_device *xe,
> > > > >         int err;
> > > > >         int i;
> > > > > -       if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > > > > +       if (XE_IOCTL_DBG(xe, args->pad) ||
> > > > >             XE_IOCTL_DBG(xe, args->reserved[0] || args-
> > > > > >reserved[1]))
> > > > >                 return -EINVAL;
> > > > > @@ -2857,6 +2854,14 @@ static int
> > > > > vm_bind_ioctl_check_args(struct xe_device *xe,
> > > > >                 *bind_ops = &args->bind;
> > > > >         }
> > > > > +       *async = !(args->syncs.flags &
> > > > > DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
> > > > > +
> > > > > +       if (XE_IOCTL_DBG(xe, args->syncs.flags &
> > > > > ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > > > > +           XE_IOCTL_DBG(xe, !*async && args-
> > > > > >syncs.num_syncs)) {
> > > > > +               err = -EINVAL;
> > > > > +               goto free_bind_ops;
> > > > > +       }
> > > > > +
> > > > >         for (i = 0; i < args->num_binds; ++i) {
> > > > >                 u64 range = (*bind_ops)[i].range;
> > > > >                 u64 addr = (*bind_ops)[i].addr;
> > > > > @@ -2887,18 +2892,6 @@ static int
> > > > > vm_bind_ioctl_check_args(struct xe_device *xe,
> > > > >                         goto free_bind_ops;
> > > > >                 }
> > > > > -               if (i == 0) {
> > > > > -                       *async = !!(flags &
> > > > > DRM_XE_VM_BIND_FLAG_ASYNC);
> > > > > -                       if (XE_IOCTL_DBG(xe, !*async && args-
> > > > > >num_syncs)) {
> > > > > -                               err = -EINVAL;
> > > > > -                               goto free_bind_ops;
> > > > > -                       }
> > > > > -               } else if (XE_IOCTL_DBG(xe, *async !=
> > > > > -                                       !!(flags &
> > > > > DRM_XE_VM_BIND_FLAG_ASYNC))) {
> > > > > -                       err = -EINVAL;
> > > > > -                       goto free_bind_ops;
> > > > > -               }
> > > > > -
> > > > >                 if (XE_IOCTL_DBG(xe, op >
> > > > > DRM_XE_VM_BIND_OP_PREFETCH) ||
> > > > >                     XE_IOCTL_DBG(xe, flags &
> > > > > ~SUPPORTED_FLAGS) ||
> > > > >                     XE_IOCTL_DBG(xe, obj && is_null) ||
> > > > > @@ -2951,7 +2944,7 @@ static int
> > > > > vm_bind_ioctl_check_args(struct xe_device *xe,
> > > > >    static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> > > > >                                        struct xe_exec_queue
> > > > > *q,
> > > > >                                        struct xe_sync_entry
> > > > > *syncs,
> > > > > -                                      int num_syncs)
> > > > > +                                      int num_syncs, bool
> > > > > async)
> > > > >    {
> > > > >         struct dma_fence *fence;
> > > > >         int i, err = 0;
> > > > > @@ -2967,7 +2960,7 @@ static int
> > > > > vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> > > > >         xe_exec_queue_last_fence_set(to_wait_exec_queue(vm,
> > > > > q), vm,
> > > > >                                      fence);
> > > > > -       if (xe_vm_sync_mode(vm, q)) {
> > > > > +       if (!async) {
> > > > >                 long timeout = dma_fence_wait(fence, true);
> > > > >                 if (timeout < 0)
> > > > > @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device
> > > > > *dev, void *data, struct drm_file *file)
> > > > >         if (err)
> > > > >                 return err;
> > > > > -       if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > > > > +       if (XE_IOCTL_DBG(xe, args->pad) ||
> > > > >             XE_IOCTL_DBG(xe, args->reserved[0] || args-
> > > > > >reserved[1]))
> > > > >                 return -EINVAL;
> > > > > @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device
> > > > > *dev, void *data, struct drm_file *file)
> > > > >                         err = -EINVAL;
> > > > >                         goto put_exec_queue;
> > > > >                 }
> > > > > -
> > > > > -               if (XE_IOCTL_DBG(xe, args->num_binds && async
> > > > > !=
> > > > > -                                !!(q->flags &
> > > > > EXEC_QUEUE_FLAG_VM_ASYNC))) {
> > > > > -                       err = -EINVAL;
> > > > > -                       goto put_exec_queue;
> > > > > -               }
> > > > >         }
> > > > >         vm = xe_vm_lookup(xef, args->vm_id);
> > > > > @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device
> > > > > *dev, void *data, struct drm_file *file)
> > > > >                 goto put_exec_queue;
> > > > >         }
> > > > > -       if (!args->exec_queue_id) {
> > > > > -               if (XE_IOCTL_DBG(xe, args->num_binds && async
> > > > > !=
> > > > > -                                !!(vm->flags &
> > > > > XE_VM_FLAG_ASYNC_DEFAULT))) {
> > > > > -                       err = -EINVAL;
> > > > > -                       goto put_vm;
> > > > > -               }
> > > > > -       }
> > > > > -
> > > > >         err = down_write_killable(&vm->lock);
> > > > >         if (err)
> > > > >                 goto put_vm;
> > > > > @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct
> > > > > drm_device *dev, void *data, struct drm_file *file)
> > > > >                 }
> > > > >         }
> > > > > -       if (args->num_syncs) {
> > > > > -               syncs = kcalloc(args->num_syncs,
> > > > > sizeof(*syncs), GFP_KERNEL);
> > > > > +       if (args->syncs.num_syncs) {
> > > > > +               syncs = kcalloc(args->syncs.num_syncs,
> > > > > sizeof(*syncs), GFP_KERNEL);
> > > > >                 if (!syncs) {
> > > > >                         err = -ENOMEM;
> > > > >                         goto put_obj;
> > > > >                 }
> > > > >         }
> > > > > -       syncs_user = u64_to_user_ptr(args->syncs);
> > > > > -       for (num_syncs = 0; num_syncs < args->num_syncs;
> > > > > num_syncs++) {
> > > > > +       syncs_user = u64_to_user_ptr(args->syncs.syncs);
> > > > > +       for (num_syncs = 0; num_syncs < args-
> > > > > >syncs.num_syncs; num_syncs++) {
> > > > >                 err = xe_sync_entry_parse(xe, xef,
> > > > > &syncs[num_syncs],
> > > > >                                          
> > > > > &syncs_user[num_syncs],
> > > > >                                          
> > > > > (xe_vm_in_lr_mode(vm) ?
> > > > > @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device
> > > > > *dev, void *data, struct drm_file *file)
> > > > >         vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
> > > > >    free_syncs:
> > > > >         if (err == -ENODATA)
> > > > > -               err = vm_bind_ioctl_signal_fences(vm, q,
> > > > > syncs, num_syncs);
> > > > > +               err = vm_bind_ioctl_signal_fences(vm, q,
> > > > > syncs, num_syncs,
> > > > > +                                                 async);
> > > > >         while (num_syncs--)
> > > > >                 xe_sync_entry_cleanup(&syncs[num_syncs]);
> > > > > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h
> > > > > b/drivers/gpu/drm/xe/xe_vm_types.h
> > > > > index 23abdfd8622f..ce8b9bde7e9c 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_vm_types.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> > > > > @@ -167,13 +167,12 @@ struct xe_vm {
> > > > >          */
> > > > >    #define XE_VM_FLAG_64K                       BIT(0)
> > > > >    #define XE_VM_FLAG_LR_MODE           BIT(1)
> > > > > -#define XE_VM_FLAG_ASYNC_DEFAULT       BIT(2)
> > > > > -#define XE_VM_FLAG_MIGRATION           BIT(3)
> > > > > -#define XE_VM_FLAG_SCRATCH_PAGE                BIT(4)
> > > > > -#define XE_VM_FLAG_FAULT_MODE          BIT(5)
> > > > > -#define XE_VM_FLAG_BANNED              BIT(6)
> > > > > -#define XE_VM_FLAG_TILE_ID(flags)      FIELD_GET(GENMASK(8,
> > > > > 7), flags)
> > > > > -#define XE_VM_FLAG_SET_TILE_ID(tile)   FIELD_PREP(GENMASK(8,
> > > > > 7), (tile)->id)
> > > > > +#define XE_VM_FLAG_MIGRATION           BIT(2)
> > > > > +#define XE_VM_FLAG_SCRATCH_PAGE                BIT(3)
> > > > > +#define XE_VM_FLAG_FAULT_MODE          BIT(4)
> > > > > +#define XE_VM_FLAG_BANNED              BIT(5)
> > > > > +#define XE_VM_FLAG_TILE_ID(flags)      FIELD_GET(GENMASK(7,
> > > > > 6), flags)
> > > > > +#define XE_VM_FLAG_SET_TILE_ID(tile)   FIELD_PREP(GENMASK(7,
> > > > > 6), (tile)->id)
> > > > >         unsigned long flags;
> > > > >         /** @composite_fence_ctx: context composite fence */
> > > > > @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
> > > > >         XE_VMA_OP_PREV_COMMITTED        = BIT(3),
> > > > >         /** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation
> > > > > committed */
> > > > >         XE_VMA_OP_NEXT_COMMITTED        = BIT(4),
> > > > > +       /** @XE_VMA_OP_ASYNC: operation is async */
> > > > > +       XE_VMA_OP_ASYNC                 = BIT(5),
> > > > >    };
> > > > >    /** struct xe_vma_op - VMA operation */
> > > > > diff --git a/include/uapi/drm/xe_drm.h
> > > > > b/include/uapi/drm/xe_drm.h
> > > > > index eb03a49c17a1..fd8172fe2d9a 100644
> > > > > --- a/include/uapi/drm/xe_drm.h
> > > > > +++ b/include/uapi/drm/xe_drm.h
> > > > > @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
> > > > >          * Kernel only classes (not actual hardware engine
> > > > > class). Used for
> > > > >          * creating ordered queues of VM bind operations.
> > > > >          */
> > > > > -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC      5
> > > > > -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC       6
> > > > > +#define DRM_XE_ENGINE_CLASS_VM_BIND            5
> > > > >         __u16 engine_class;
> > > > >         __u16 engine_instance;
> > > > > @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
> > > > >          * still enable recoverable pagefaults if supported
> > > > > by the device.
> > > > >          */
> > > > >    #define DRM_XE_VM_CREATE_FLAG_LR_MODE                (1 <<
> > > > > 1)
> > > > > -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT    (1 << 2)
> > > > >         /*
> > > > >          * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
> > > > >          * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to
> > > > > be allocated
> > > > > @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
> > > > >          * The xe driver internally uses recoverable
> > > > > pagefaults to implement
> > > > >          * this.
> > > > >          */
> > > > > -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE       (1 << 3)
> > > > > +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE       (1 << 2)
> > > > >         /** @flags: Flags */
> > > > >         __u32 flags;
> > > > > @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
> > > > >         __u32 op;
> > > > >    #define DRM_XE_VM_BIND_FLAG_READONLY (1 << 0)
> > > > > -#define DRM_XE_VM_BIND_FLAG_ASYNC      (1 << 1)
> > > > >         /*
> > > > >          * Valid on a faulting VM only, do the MAP operation
> > > > > immediately rather
> > > > >          * than deferring the MAP to the page fault handler.
> > > > >          */
> > > > > -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE  (1 << 2)
> > > > > +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE  (1 << 1)
> > > > >         /*
> > > > >          * When the NULL flag is set, the page tables are
> > > > > setup with a special
> > > > >          * bit which indicates writes are dropped and all
> > > > > reads return zero.  In
> > > > > @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
> > > > >          * operations, the BO handle MBZ, and the BO offset
> > > > > MBZ. This flag is
> > > > >          * intended to implement VK sparse bindings.
> > > > >          */
> > > > > -#define DRM_XE_VM_BIND_FLAG_NULL       (1 << 3)
> > > > > +#define DRM_XE_VM_BIND_FLAG_NULL       (1 << 2)
> > > > >         /** @flags: Bind flags */
> > > > >         __u32 flags;
> > > > > @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
> > > > >         __u64 reserved[3];
> > > > >    };
> > > > > +/**
> > > > > + * struct drm_xe_syncs - In / out syncs for IOCTLs.
> > > > > + */
> > > > > +struct drm_xe_syncs {
> > > > > +       /** @num_syncs: amount of syncs to wait on */
> > > > > +       __u32 num_syncs;
> > > > > +
> > > > > +       /*
> > > > > +        * Block in IOCTL until operation complete, num_syncs
> > > > > MBZ if set.
> > > > > +        */
> > > > > +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
> > > > > +       /** @flags: Sync flags */
> > > > > +       __u32 flags;
> > > > > +
> > > > > +       /** @syncs: pointer to struct drm_xe_sync array */
> > > > > +       __u64 syncs;
> > > > > +
> > > > > +       /** @reserved: Reserved */
> > > > > +       __u64 reserved[2];
> > > > > +};
> > > > > +
> > > > >    struct drm_xe_vm_bind {
> > > > >         /** @extensions: Pointer to the first extension
> > > > > struct, if any */
> > > > >         __u64 extensions;
> > > > > @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
> > > > >                 __u64 vector_of_binds;
> > > > >         };
> > > > > -       /** @pad: MBZ */
> > > > > -       __u32 pad2;
> > > > > -
> > > > > -       /** @num_syncs: amount of syncs to wait on */
> > > > > -       __u32 num_syncs;
> > > > > -
> > > > > -       /** @syncs: pointer to struct drm_xe_sync array */
> > > > > -       __u64 syncs;
> > > > > +       /** @syncs: syncs for bind */
> > > > > +       struct drm_xe_syncs syncs;
> > > > >         /** @reserved: Reserved */
> > > > >         __u64 reserved[2];
> > > > > @@ -974,14 +986,14 @@ struct drm_xe_exec {
> > > > >         /** @extensions: Pointer to the first extension
> > > > > struct, if any */
> > > > >         __u64 extensions;
> > > > > +       /** @pad: MBZ */
> > > > > +       __u32 pad;
> > > > > +
> > > > >         /** @exec_queue_id: Exec queue ID for the batch
> > > > > buffer */
> > > > >         __u32 exec_queue_id;
> > > > > -       /** @num_syncs: Amount of struct drm_xe_sync in
> > > > > array. */
> > > > > -       __u32 num_syncs;
> > > > > -
> > > > > -       /** @syncs: Pointer to struct drm_xe_sync array. */
> > > > > -       __u64 syncs;
> > > > > +       /** @syncs: syncs for exec */
> > > > > +       struct drm_xe_syncs syncs;
> > > > >         /**
> > > > >          * @address: address of batch buffer if
> > > > > num_batch_buffer == 1 or an
> > > > > @@ -995,8 +1007,8 @@ struct drm_xe_exec {
> > > > >          */
> > > > >         __u16 num_batch_buffer;
> > > > > -       /** @pad: MBZ */
> > > > > -       __u16 pad[3];
> > > > > +       /** @pad2: MBZ */
> > > > > +       __u16 pad2[3];
> > > > >         /** @reserved: Reserved */
> > > > >         __u64 reserved[2];


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-11 18:11           ` Thomas Hellström
@ 2023-12-11 21:11             ` Matthew Brost
  2023-12-12  8:43               ` Thomas Hellström
  0 siblings, 1 reply; 22+ messages in thread
From: Matthew Brost @ 2023-12-11 21:11 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: Francois Dugast, intel-xe, Rodrigo Vivi

On Mon, Dec 11, 2023 at 07:11:15PM +0100, Thomas Hellström wrote:
> On Mon, 2023-12-11 at 16:49 +0000, Matthew Brost wrote:
> > On Mon, Dec 11, 2023 at 04:43:06PM +0100, Thomas Hellström wrote:
> > > 
> > > On 12/8/23 10:45, Matthew Brost wrote:
> > > > On Fri, Dec 08, 2023 at 04:00:37PM +0100, Thomas Hellström wrote:
> > > > > On 12/7/23 06:57, Matthew Brost wrote:
> > > > > > Remove concept of async vs sync VM bind queues, rather make
> > > > > > async vs
> > > > > > sync a per IOCTL choice. Since this is per IOCTL, it makes
> > > > > > sense to have
> > > > > > a singular flag IOCTL rather than per VM bind op flag too.
> > > > > > Add
> > > > > > DRM_XE_SYNCS_FLAG_WAIT_FOR_OP which is an input sync flag to
> > > > > > support
> > > > > > this. Support this new flag for both the VM bind IOCTL and
> > > > > > the exec
> > > > > > IOCTL to match behavior.
> > > > > > 
> > > > > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > Cc: Francois Dugast <francois.dugast@intel.com>
> > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > > ---
> > > > > >    drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++---
> > > > > > -
> > > > > >    drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
> > > > > >    drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
> > > > > >    drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-
> > > > > > ------------
> > > > > >    drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
> > > > > >    include/uapi/drm/xe_drm.h                |  56 +++++++----
> > > > > > -
> > > > > >    6 files changed, 129 insertions(+), 119 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/gpu/drm/xe/xe_exec.c
> > > > > > b/drivers/gpu/drm/xe/xe_exec.c
> > > > > > index 92b0da6580e8..c62cabfaa112 100644
> > > > > > --- a/drivers/gpu/drm/xe/xe_exec.c
> > > > > > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > > > > > @@ -130,12 +130,15 @@ static int xe_exec_begin(struct
> > > > > > drm_exec *exec, struct xe_vm *vm)
> > > > > >         return err;
> > > > > >    }
> > > > > > +#define ALL_DRM_XE_SYNCS_FLAGS
> > > > > > (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> > > > > > +
> > > > > >    int xe_exec_ioctl(struct drm_device *dev, void *data,
> > > > > > struct drm_file *file)
> > > > > >    {
> > > > > >         struct xe_device *xe = to_xe_device(dev);
> > > > > >         struct xe_file *xef = to_xe_file(file);
> > > > > >         struct drm_xe_exec *args = data;
> > > > > > -       struct drm_xe_sync __user *syncs_user =
> > > > > > u64_to_user_ptr(args->syncs);
> > > > > > +       struct drm_xe_sync __user *syncs_user =
> > > > > > +               u64_to_user_ptr(args->syncs.syncs);
> > > > > >         u64 __user *addresses_user = u64_to_user_ptr(args-
> > > > > > >address);
> > > > > >         struct xe_exec_queue *q;
> > > > > >         struct xe_sync_entry *syncs = NULL;
> > > > > > @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device
> > > > > > *dev, void *data, struct drm_file *file)
> > > > > >         struct drm_exec exec;
> > > > > >         u32 i, num_syncs = 0;
> > > > > >         struct xe_sched_job *job;
> > > > > > -       struct dma_fence *rebind_fence;
> > > > > > +       struct dma_fence *rebind_fence, *job_fence;
> > > > > >         struct xe_vm *vm;
> > > > > > -       bool write_locked;
> > > > > > +       bool write_locked, skip_job_put = false;
> > > > > > +       bool wait = args->syncs.flags &
> > > > > > DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
> > > > > >         ktime_t end = 0;
> > > > > >         int err = 0;
> > > > > >         if (XE_IOCTL_DBG(xe, args->extensions) ||
> > > > > > -           XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] ||
> > > > > > args->pad[2]) ||
> > > > > > -           XE_IOCTL_DBG(xe, args->reserved[0] || args-
> > > > > > >reserved[1]))
> > > > > > +           XE_IOCTL_DBG(xe, args->pad || args->pad2[0] ||
> > > > > > args->pad2[1] || args->pad2[2]) ||
> > > > > > +           XE_IOCTL_DBG(xe, args->reserved[0] || args-
> > > > > > >reserved[1]) ||
> > > > > > +           XE_IOCTL_DBG(xe, args->syncs.flags &
> > > > > > ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > > > > > +           XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
> > > > > >                 return -EINVAL;
> > > > > >         q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> > > > > > @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev,
> > > > > > void *data, struct drm_file *file)
> > > > > >                 goto err_exec_queue;
> > > > > >         }
> > > > > > -       if (args->num_syncs) {
> > > > > > -               syncs = kcalloc(args->num_syncs,
> > > > > > sizeof(*syncs), GFP_KERNEL);
> > > > > > +       if (args->syncs.num_syncs) {
> > > > > > +               syncs = kcalloc(args->syncs.num_syncs,
> > > > > > sizeof(*syncs),
> > > > > > +                               GFP_KERNEL);
> > > > > >                 if (!syncs) {
> > > > > >                         err = -ENOMEM;
> > > > > >                         goto err_exec_queue;
> > > > > > @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev,
> > > > > > void *data, struct drm_file *file)
> > > > > >         vm = q->vm;
> > > > > > -       for (i = 0; i < args->num_syncs; i++) {
> > > > > > +       for (i = 0; i < args->syncs.num_syncs; i++) {
> > > > > >                 err = xe_sync_entry_parse(xe, xef,
> > > > > > &syncs[num_syncs++],
> > > > > >                                           &syncs_user[i],
> > > > > > SYNC_PARSE_FLAG_EXEC |
> > > > > >                                          
> > > > > > (xe_vm_in_lr_mode(vm) ?
> > > > > > @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device
> > > > > > *dev, void *data, struct drm_file *file)
> > > > > >                                 err = PTR_ERR(fence);
> > > > > >                                 goto err_exec;
> > > > > >                         }
> > > > > > +
> > > > > >                         for (i = 0; i < num_syncs; i++)
> > > > > >                                 xe_sync_entry_signal(&syncs[i
> > > > > > ], NULL, fence);
> > > > > > +
> > > > > >                         xe_exec_queue_last_fence_set(q, vm,
> > > > > > fence);
> > > > > > +                       if (wait) {
> > > > > > +                               long timeout =
> > > > > > dma_fence_wait(fence, true);
> > > > > > +
> > > > > > +                               if (timeout < 0)
> > > > > > +                                       err = -EINTR;
> > > > > > +                       }
> > > > > Here it looks like we will rerun the same IOCTL again if we
> > > > > return -EINTR.
> > > > > The user-space expected action on -EINTR is to just restart the
> > > > > IOCTL
> > > > > without any argument changes. Solution is to add an ioctl
> > > > > argument cookie
> > > > > (or to skip sync vm binds and have the user just use the 0
> > > > > batch buffers /
> > > > > vm_binds calls or wait for an out-fence). If you go for the
> > > > > cookie solution
> > > > > then IMO we should keep the -ERESTARTSYS returned from
> > > > > dma_fence_wait()
> > > > > since it's converted to -EINTR on return-to-user-space, and the
> > > > > kernel
> > > > > restarts the IOCTL automatically if there was no requested-for-
> > > > > delivery
> > > > > signal pending.
> > > > > 
> > > > > I think the simplest solution at this point is to skip the sync
> > > > > behaviour,
> > > > > in particular if we enable the 0 batch / bind possibility.
> > > > > 
> > > > > If we still want to provide it, we could add a cookie address
> > > > > as an
> > > > > extension to the ioctl and activate sync if present? (Just
> > > > > throwing up ideas
> > > > > here).
> > > > > 
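For context, the "just restart the IOCTL without any argument changes"
expectation quoted above is the usual userspace idiom below (a rough
sketch, not code from this patch; the wrapper name is made up):

  #include <errno.h>
  #include <sys/ioctl.h>

  /*
   * Restart the ioctl verbatim on EINTR. This is only safe if the kernel
   * guarantees the interrupted call had no side effects -- which is
   * exactly what breaks once a job has already been committed.
   */
  static int xe_ioctl_restartable(int fd, unsigned long request, void *arg)
  {
          int ret;

          do {
                  ret = ioctl(fd, request, arg);
          } while (ret == -1 && errno == EINTR);

          return ret;
  }
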
> > > > Hmm, forgot about this. A cookie is fairly easy, what about
> > > > something like this:
> > > > 
> > > > /**
> > > >  * struct drm_xe_syncs - In / out syncs for IOCTLs.
> > > >  */
> > > > struct drm_xe_syncs {
> > > >         /** @num_syncs: amount of syncs to wait on */
> > > >         __u32 num_syncs;
> > > > 
> > > >         /*
> > > >          * Block in IOCTL until operation complete, num_syncs MBZ if set.
> > > >          */
> > > > #define DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP (1 << 0)
> > > >         /** @in_flags: Input Sync flags */
> > > >         __u16 in_flags;
> > > > 
> > > >         /*
> > > >          * IOCTL operation has started (no need for user to resubmit on
> > > >          * -ERESTARTSYS)
> > > >          */
> > > > #define DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED (1 << 0)
> > > >         /** @out_flags: Output Sync flags */
> > > >         __u16 out_flags;
> > > > 
> > > >         /** @syncs: pointer to struct drm_xe_sync array */
> > > >         __u64 syncs;
> > > > 
> > > >         /** @reserved: Reserved */
> > > >         __u64 reserved[2];
> > > > };
> > > > 
> > > > DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED gets set in the exec / bind
> > > > IOCTL after the job is committed or, in the case of zero ops, once
> > > > the last fence is updated on the queue. Note that for binds we
> > > > don't yet do 1 job per IOCTL, but will after landing some version
> > > > of [1].
> > > > 
> > > > After DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED is set we return
> > > > -ERESTARTSYS if the wait is interrupted, and -EINTR if
> > > > DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED is still unset (interrupted
> > > > before the job is committed).
> > > > 
> > > > I'd rather go with this patch as we have to change the uAPI here
> > > > regardless, so we might as well make this complete.
> > > > 
> > > > Matt
> > > > 
> > > > [1] https://patchwork.freedesktop.org/series/125608/
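To see the quoted in_flags / out_flags proposal from the userspace side,
a rough sketch (it assumes the ioctl becomes RW so out_flags is copied
back to the user, and the wrapper name is made up):

  #include <errno.h>
  #include <sys/ioctl.h>
  #include <drm/xe_drm.h>

  /* Sketch: sync bind using the proposed RW syncs.out_flags field. */
  static int xe_vm_bind_sync(int fd, struct drm_xe_vm_bind *bind)
  {
          bind->syncs.in_flags = DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP;
          bind->syncs.out_flags = 0;      /* MBZ on the first call */

          for (;;) {
                  if (ioctl(fd, DRM_IOCTL_XE_VM_BIND, bind) == 0)
                          return 0;
                  if (errno != EINTR)
                          return -errno;
                  /*
                   * EINTR with OP_COMMITTED now set in out_flags: the op
                   * is already queued, so the unmodified restart only
                   * redoes the wait. With the flag still clear, nothing
                   * was committed and the restart resubmits from scratch.
                   * Either way the loop body is the same.
                   */
          }
  }
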
> > > 
> > > Yeah, as we discussed in the meeting, that means making the ioctl RW
> > > instead of W, with some copying overhead.
> > > 
> > > I also think we should leave the EXEC ioctl out of this, meaning
> > > just having
> > > a single field in the VM_BIND ioctl. Basically the reason is that
> > > waiting
> > > like this after submission is a bit weird and does not align well
> > > with how
> > > -EINTR is typically used.
> > > 
> > 
> > I kinda like having uniform behavior between exec and binds, with the
> > behavior defined in a common sync structure.
> 
> Even so, I strongly think we should *not* in any way expose this for
> exec. If needed the user can just wait for an out-fence, and then we
> don't need to implement code for this that will probably never get
> used, with an implementation that very few will understand.
> 
> Furthermore the sync VM_BIND ioctl per the ASYNC VM_BIND doc allows
> neither in-fences nor out-fences, so grouping like this becomes a bit
> of overkill.
> 
> > 
> > > So either a pointer to a cookie in the ioctl,
> > > 
> > 
> > What about:
> > 
> > /**
> >  * struct drm_xe_syncs - In / out syncs for IOCTLs.
> >  */
> > struct drm_xe_syncs {
> >         /** @num_syncs: amount of syncs to wait on */
> >         __u32 num_syncs;
> > 
> >         /*
> >          * Block in IOCTL until operation complete, num_syncs MBZ if set.
> >          */
> > #define DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP (1 << 0)
> >         /** @in_flags: Input Sync flags */
> >         __u32 in_flags;
> > 
> >         /**
> >          * @cookie: userptr cookie written back with a non-zero value
> >          * once the operation is committed, only valid when the IOCTL
> >          * returns -EINTR
> >          */
> >         __u64 cookie;
> > 
> >         /** @syncs: pointer to struct drm_xe_sync array */
> >         __u64 syncs;
> > 
> >         /** @reserved: Reserved */
> >         __u64 reserved[2];
> > };
> > 
> > Also if cookie is 0, do we wait uninterruptibly once the op is
> > committed?
> 
> I'm afraid I don't follow. The *interruptible* wait after commit is
> what triggers the need for a cookie in the first place? Also here,
> @cookie is still read-only for the kernel since the struct drm_xe_syncs
> is embedded in the ioctl. Also I think any cookie should be opaque to
> the user, other than that it must be 0 if not calling after an
> -ERESTART.
> 

Cookie here is a user address which is written back to when a sync wait
is interrupted. The expected value of *cookie is zero on IOCTL
submission. If cookie == NULL, the sync wait would be uninterruptible
(we can skip this part if it is confusing). The kernel only writes
*cookie when a sync wait is interrupted, and the value written is
defined simply as non-zero.
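Spelled out from the userspace side, the handshake would look roughly
like this (a sketch assuming the @cookie field from the struct above;
the wrapper name is made up):

  #include <errno.h>
  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <drm/xe_drm.h>

  /* Sketch: sync bind using the proposed userptr cookie. */
  static int xe_vm_bind_sync_cookie(int fd, struct drm_xe_vm_bind *bind)
  {
          __u64 committed = 0;    /* kernel writes non-zero once committed */

          bind->syncs.in_flags = DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP;
          bind->syncs.cookie = (__u64)(uintptr_t)&committed;

          for (;;) {
                  if (ioctl(fd, DRM_IOCTL_XE_VM_BIND, bind) == 0)
                          return 0;
                  if (errno != EINTR)
                          return -errno;
                  /*
                   * committed != 0 tells the restarted call that the op is
                   * already queued and only the wait needs redoing; beyond
                   * the zero / non-zero test the value stays opaque.
                   */
          }
  }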

> > 
> > > or perhaps dig up again the idea we had of mostly waiting before
> > > the
> > > submission:
> > > 
> > > 1) Pull out the last_op fence for the queue from under the relevant
> > > lock.
> > > 2) Wait for all dependencies without any locks.
> > > 3) Lock, and (optionally) if the last_op fence changed, wait for
> > > it.
> > > 4) Submit
> > > 5) Wait for completion uninterruptible.
> > > 
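In rough pseudo-C, with entirely hypothetical helper names, the 1) - 5)
flow above would be (the point being that all interruptible waiting
happens before anything is committed, so the final wait can be short and
uninterruptible):

  /* Pseudo-C sketch of the flow above; all helpers are hypothetical. */
  static int bind_sync_sketch(struct xe_vm *vm, struct xe_exec_queue *q)
  {
          struct dma_fence *last, *fence;
          long err;

          last = get_last_op_fence(vm, q);        /* 1) under the lock */

          err = dma_fence_wait(last, true);       /* 2) interruptible, no locks */
          if (err < 0)
                  goto out;                       /* safe: nothing committed yet */

          lock(vm);
          if (last_op_fence_changed(vm, q, last))
                  wait_for_new_last_fence(vm, q); /* 3) optional re-check */
          fence = submit_bind(vm, q);             /* 4) */
          unlock(vm);

          dma_fence_wait(fence, false);           /* 5) short + uninterruptible */
          dma_fence_put(fence);
  out:
          dma_fence_put(last);
          return err < 0 ? err : 0;
  }
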
> > 
> > We can always change the internal implementation to something like
> > this
> > after [1]. That series makes refactors like this quite a bit easier.
> 
> Well the idea of the above 1) - 5) was that we wouldn't be needing any
> cookie at all, since the wait in 5) would be short, and we therefore
> could get away with implementing it as uninterruptible. If that turned
> out to be bad, we could add the cookie as an extension. The initial
> implementation can even use uninterruptible waits for simplicity.
> 
> To summarize:
> 
> * I strongly think we should not support sync exec calls.

What about the same interface as defined above, but on exec we return
-EOPNOTSUPP if DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP is set? This gives us a
uniform interface between bind and exec with an optional path to support
sync execs in the future if a UMD asks for it.
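Concretely, that could be as small as this in xe_exec_ioctl() (a sketch
on top of the proposed flags, not part of the current patch):

          if (XE_IOCTL_DBG(xe, args->syncs.in_flags &
                           DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP))
                  return -EOPNOTSUPP;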

> * No in-syncs or out-syncs if SYNC.

Agree.

> * A flag to trigger sync binds. Syncs could be in a separate struct,
> but not really needed if we don't support sync execs.

See above, the interface I'm proposing has this.

> * If we go for the interruptible wait, we need a writable cookie that
> is not embedded in the main struct.

See above, the interface I'm proposing has this.

Matt

> 
> 
> /Thomas
> 
> 
> 
> > 
> > Matt
> > 
> > [1] https://patchwork.freedesktop.org/series/125608/ 
> > 
> > > I actually like this last one best, but we'd recommend UMDs use
> > > out-fences whenever possible.
> > > 
> > > Thoughts?
> > > 
> > > > 
> > > > > >                         dma_fence_put(fence);
> > > > > >                 }
> > > > > > @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device
> > > > > > *dev, void *data, struct drm_file *file)
> > > > > >          * the job and let the DRM scheduler / backend clean
> > > > > > up the job.
> > > > > >          */
> > > > > >         xe_sched_job_arm(job);
> > > > > > +       job_fence = &job->drm.s_fence->finished;
> > > > > > +       if (wait)
> > > > > > +               dma_fence_get(job_fence);
> > > > > >         if (!xe_vm_in_lr_mode(vm)) {
> > > > > >                 /* Block userptr invalidations / BO eviction
> > > > > > */
> > > > > > -               dma_resv_add_fence(&vm->resv,
> > > > > > -                                  &job->drm.s_fence-
> > > > > > >finished,
> > > > > > +               dma_resv_add_fence(&vm->resv, job_fence,
> > > > > >                                    DMA_RESV_USAGE_BOOKKEEP);
> > > > > >                 /*
> > > > > >                  * Make implicit sync work across drivers,
> > > > > > assuming all external
> > > > > >                  * BOs are written as we don't pass in a read
> > > > > > / write list.
> > > > > >                  */
> > > > > > -               xe_vm_fence_all_extobjs(vm, &job-
> > > > > > >drm.s_fence->finished,
> > > > > > -
> > > > > >                                        DMA_RESV_USAGE_WRITE);
> > > > > > +               xe_vm_fence_all_extobjs(vm, job_fence,
> > > > > > DMA_RESV_USAGE_WRITE);
> > > > > >         }
> > > > > >         for (i = 0; i < num_syncs; i++)
> > > > > > -               xe_sync_entry_signal(&syncs[i], job,
> > > > > > -                                    &job->drm.s_fence-
> > > > > > >finished);
> > > > > > +               xe_sync_entry_signal(&syncs[i], job,
> > > > > > job_fence);
> > > > > >         if (xe_exec_queue_is_lr(q))
> > > > > >                 q->ring_ops->emit_job(job);
> > > > > >         if (!xe_vm_in_lr_mode(vm))
> > > > > > -               xe_exec_queue_last_fence_set(q, vm, &job-
> > > > > > >drm.s_fence->finished);
> > > > > > +               xe_exec_queue_last_fence_set(q, vm,
> > > > > > job_fence);
> > > > > >         xe_sched_job_push(job);
> > > > > >         xe_vm_reactivate_rebind(vm);
> > > > > > -       if (!err && !xe_vm_in_lr_mode(vm)) {
> > > > > > +       if (!xe_vm_in_lr_mode(vm)) {
> > > > > >                 spin_lock(&xe->ttm.lru_lock);
> > > > > >                 ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
> > > > > >                 spin_unlock(&xe->ttm.lru_lock);
> > > > > >         }
> > > > > > +       skip_job_put = true;
> > > > > > +       if (wait) {
> > > > > > +               long timeout = dma_fence_wait(job_fence,
> > > > > > true);
> > > > > > +
> > > > > > +               dma_fence_put(job_fence);
> > > > > > +               if (timeout < 0)
> > > > > > +                       err = -EINTR;
> > > > > > +       }
> > > > > > +
> > > > > >    err_repin:
> > > > > >         if (!xe_vm_in_lr_mode(vm))
> > > > > >                 up_read(&vm->userptr.notifier_lock);
> > > > > >    err_put_job:
> > > > > > -       if (err)
> > > > > > +       if (err && !skip_job_put)
> > > > > >                 xe_sched_job_put(job);
> > > > > >    err_exec:
> > > > > >         drm_exec_fini(&exec);
> > > > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > > b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > > index 3911d14522ee..98776d02d634 100644
> > > > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > > @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct
> > > > > > drm_device *dev, void *data,
> > > > > >         if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe-
> > > > > > >info.gt_count))
> > > > > >                 return -EINVAL;
> > > > > > -       if (eci[0].engine_class >=
> > > > > > DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
> > > > > > -               bool sync = eci[0].engine_class ==
> > > > > > -                       DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
> > > > > > -
> > > > > > +       if (eci[0].engine_class ==
> > > > > > DRM_XE_ENGINE_CLASS_VM_BIND) {
> > > > > >                 for_each_gt(gt, xe, id) {
> > > > > >                         struct xe_exec_queue *new;
> > > > > > @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct
> > > > > > drm_device *dev, void *data,
> > > > > >                                                    args-
> > > > > > >width, hwe,
> > > > > >                                                   
> > > > > > EXEC_QUEUE_FLAG_PERSISTENT |
> > > > > >                                                   
> > > > > > EXEC_QUEUE_FLAG_VM |
> > > > > > -                                                  (sync ? 0
> > > > > > :
> > > > > > -                                                  
> > > > > > EXEC_QUEUE_FLAG_VM_ASYNC) |
> > > > > >                                                    (id ?
> > > > > >                                                    
> > > > > > EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
> > > > > >                                                     0));
> > > > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > > b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > > index 52f0927d0d9b..c78f6e8b41c4 100644
> > > > > > --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > > @@ -74,8 +74,6 @@ struct xe_exec_queue {
> > > > > >    #define EXEC_QUEUE_FLAG_VM                   BIT(4)
> > > > > >    /* child of VM queue for multi-tile VM jobs */
> > > > > >    #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD    BIT(5)
> > > > > > -/* VM jobs for this queue are asynchronous */
> > > > > > -#define EXEC_QUEUE_FLAG_VM_ASYNC               BIT(6)
> > > > > >         /**
> > > > > >          * @flags: flags for this exec queue, should
> > > > > > statically setup aside from ban
> > > > > > diff --git a/drivers/gpu/drm/xe/xe_vm.c
> > > > > > b/drivers/gpu/drm/xe/xe_vm.c
> > > > > > index cf2eb44a71db..4b0c976c003a 100644
> > > > > > --- a/drivers/gpu/drm/xe/xe_vm.c
> > > > > > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > > > > > @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct
> > > > > > xe_device *xe, u32 flags)
> > > > > >                         struct xe_gt *gt = tile->primary_gt;
> > > > > >                         struct xe_vm *migrate_vm;
> > > > > >                         struct xe_exec_queue *q;
> > > > > > -                       u32 create_flags = EXEC_QUEUE_FLAG_VM
> > > > > > |
> > > > > > -                               ((flags &
> > > > > > XE_VM_FLAG_ASYNC_DEFAULT) ?
> > > > > > -                               EXEC_QUEUE_FLAG_VM_ASYNC :
> > > > > > 0);
> > > > > > +                       u32 create_flags =
> > > > > > EXEC_QUEUE_FLAG_VM;
> > > > > >                         if (!vm->pt_root[id])
> > > > > >                                 continue;
> > > > > > @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma,
> > > > > > struct xe_exec_queue *q,
> > > > > >         return ERR_PTR(err);
> > > > > >    }
> > > > > > -static bool xe_vm_sync_mode(struct xe_vm *vm, struct
> > > > > > xe_exec_queue *q)
> > > > > > -{
> > > > > > -       return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
> > > > > > -               !(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
> > > > > > -}
> > > > > > -
> > > > > >    static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma
> > > > > > *vma,
> > > > > >                         struct xe_exec_queue *q, struct
> > > > > > xe_sync_entry *syncs,
> > > > > >                         u32 num_syncs, bool immediate, bool
> > > > > > first_op,
> > > > > > -                       bool last_op)
> > > > > > +                       bool last_op, bool async)
> > > > > >    {
> > > > > >         struct dma_fence *fence;
> > > > > >         struct xe_exec_queue *wait_exec_queue =
> > > > > > to_wait_exec_queue(vm, q);
> > > > > > @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm
> > > > > > *vm, struct xe_vma *vma,
> > > > > >         if (last_op)
> > > > > >                 xe_exec_queue_last_fence_set(wait_exec_queue,
> > > > > > vm, fence);
> > > > > > -       if (last_op && xe_vm_sync_mode(vm, q))
> > > > > > +       if (last_op && !async)
> > > > > >                 dma_fence_wait(fence, true);
> > > > > >         dma_fence_put(fence);
> > > > > > @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm
> > > > > > *vm, struct xe_vma *vma,
> > > > > >    static int xe_vm_bind(struct xe_vm *vm, struct xe_vma
> > > > > > *vma, struct xe_exec_queue *q,
> > > > > >                       struct xe_bo *bo, struct xe_sync_entry
> > > > > > *syncs,
> > > > > >                       u32 num_syncs, bool immediate, bool
> > > > > > first_op,
> > > > > > -                     bool last_op)
> > > > > > +                     bool last_op, bool async)
> > > > > >    {
> > > > > >         int err;
> > > > > > @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm
> > > > > > *vm, struct xe_vma *vma, struct xe_exec_queue
> > > > > >         }
> > > > > >         return __xe_vm_bind(vm, vma, q, syncs, num_syncs,
> > > > > > immediate, first_op,
> > > > > > -                           last_op);
> > > > > > +                           last_op, async);
> > > > > >    }
> > > > > >    static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma
> > > > > > *vma,
> > > > > >                         struct xe_exec_queue *q, struct
> > > > > > xe_sync_entry *syncs,
> > > > > > -                       u32 num_syncs, bool first_op, bool
> > > > > > last_op)
> > > > > > +                       u32 num_syncs, bool first_op, bool
> > > > > > last_op, bool async)
> > > > > >    {
> > > > > >         struct dma_fence *fence;
> > > > > >         struct xe_exec_queue *wait_exec_queue =
> > > > > > to_wait_exec_queue(vm, q);
> > > > > > @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm
> > > > > > *vm, struct xe_vma *vma,
> > > > > >         xe_vma_destroy(vma, fence);
> > > > > >         if (last_op)
> > > > > >                 xe_exec_queue_last_fence_set(wait_exec_queue,
> > > > > > vm, fence);
> > > > > > -       if (last_op && xe_vm_sync_mode(vm, q))
> > > > > > +       if (last_op && !async)
> > > > > >                 dma_fence_wait(fence, true);
> > > > > It looks like we're dropping the error return code here.
> > > > > 
> > > > > 
> > > > > >         dma_fence_put(fence);
> > > > > > @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm
> > > > > > *vm, struct xe_vma *vma,
> > > > > >    #define ALL_DRM_XE_VM_CREATE_FLAGS
> > > > > > (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
> > > > > >                                    
> > > > > > DRM_XE_VM_CREATE_FLAG_LR_MODE | \
> > > > > > -                                  
> > > > > > DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
> > > > > >                                    
> > > > > > DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> > > > > >    int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> > > > > > @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct
> > > > > > drm_device *dev, void *data,
> > > > > >                 flags |= XE_VM_FLAG_SCRATCH_PAGE;
> > > > > >         if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
> > > > > >                 flags |= XE_VM_FLAG_LR_MODE;
> > > > > > -       if (args->flags &
> > > > > > DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
> > > > > > -               flags |= XE_VM_FLAG_ASYNC_DEFAULT;
> > > > > >         if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
> > > > > >                 flags |= XE_VM_FLAG_FAULT_MODE;
> > > > > > @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] =
> > > > > > {
> > > > > >    static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma
> > > > > > *vma,
> > > > > >                           struct xe_exec_queue *q, u32
> > > > > > region,
> > > > > >                           struct xe_sync_entry *syncs, u32
> > > > > > num_syncs,
> > > > > > -                         bool first_op, bool last_op)
> > > > > > +                         bool first_op, bool last_op, bool
> > > > > > async)
> > > > > >    {
> > > > > >         struct xe_exec_queue *wait_exec_queue =
> > > > > > to_wait_exec_queue(vm, q);
> > > > > >         int err;
> > > > > > @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm
> > > > > > *vm, struct xe_vma *vma,
> > > > > >         if (vma->tile_mask != (vma->tile_present & ~vma-
> > > > > > >usm.tile_invalidated)) {
> > > > > >                 return xe_vm_bind(vm, vma, q, xe_vma_bo(vma),
> > > > > > syncs, num_syncs,
> > > > > > -                                 true, first_op, last_op);
> > > > > > +                                 true, first_op, last_op,
> > > > > > async);
> > > > > >         } else {
> > > > > >                 int i;
> > > > > > @@ -2400,6 +2389,8 @@ static int
> > > > > > vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct
> > > > > > xe_exec_queue *q,
> > > > > >                 }
> > > > > >                 op->q = q;
> > > > > > +               if (async)
> > > > > > +                       op->flags |= XE_VMA_OP_ASYNC;
> > > > > >                 switch (op->base.op) {
> > > > > >                 case DRM_GPUVA_OP_MAP:
> > > > > > @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec
> > > > > > *exec, struct xe_vm *vm,
> > > > > >                                  op->syncs, op->num_syncs,
> > > > > >                                  op->map.immediate ||
> > > > > > !xe_vm_in_fault_mode(vm),
> > > > > >                                  op->flags & XE_VMA_OP_FIRST,
> > > > > > -                                op->flags & XE_VMA_OP_LAST);
> > > > > > +                                op->flags & XE_VMA_OP_LAST,
> > > > > > +                                op->flags &
> > > > > > XE_VMA_OP_ASYNC);
> > > > > >                 break;
> > > > > >         case DRM_GPUVA_OP_REMAP:
> > > > > >         {
> > > > > > @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec
> > > > > > *exec, struct xe_vm *vm,
> > > > > >                                            op->num_syncs,
> > > > > >                                            op->flags &
> > > > > > XE_VMA_OP_FIRST,
> > > > > >                                            op->flags &
> > > > > > XE_VMA_OP_LAST &&
> > > > > > -                                          !prev && !next);
> > > > > > +                                          !prev && !next,
> > > > > > +                                          op->flags &
> > > > > > XE_VMA_OP_ASYNC);
> > > > > >                         if (err)
> > > > > >                                 break;
> > > > > >                         op->remap.unmap_done = true;
> > > > > > @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec
> > > > > > *exec, struct xe_vm *vm,
> > > > > >                         err = xe_vm_bind(vm, op->remap.prev,
> > > > > > op->q,
> > > > > >                                          xe_vma_bo(op-
> > > > > > >remap.prev), op->syncs,
> > > > > >                                          op->num_syncs, true,
> > > > > > false,
> > > > > > -                                        op->flags &
> > > > > > XE_VMA_OP_LAST && !next);
> > > > > > +                                        op->flags &
> > > > > > XE_VMA_OP_LAST && !next,
> > > > > > +                                        op->flags &
> > > > > > XE_VMA_OP_ASYNC);
> > > > > >                         op->remap.prev->gpuva.flags &=
> > > > > > ~XE_VMA_LAST_REBIND;
> > > > > >                         if (err)
> > > > > >                                 break;
> > > > > > @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec
> > > > > > *exec, struct xe_vm *vm,
> > > > > >                                          xe_vma_bo(op-
> > > > > > >remap.next),
> > > > > >                                          op->syncs, op-
> > > > > > >num_syncs,
> > > > > >                                          true, false,
> > > > > > -                                        op->flags &
> > > > > > XE_VMA_OP_LAST);
> > > > > > +                                        op->flags &
> > > > > > XE_VMA_OP_LAST,
> > > > > > +                                        op->flags &
> > > > > > XE_VMA_OP_ASYNC);
> > > > > >                         op->remap.next->gpuva.flags &=
> > > > > > ~XE_VMA_LAST_REBIND;
> > > > > >                         if (err)
> > > > > >                                 break;
> > > > > > @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec
> > > > > > *exec, struct xe_vm *vm,
> > > > > >         case DRM_GPUVA_OP_UNMAP:
> > > > > >                 err = xe_vm_unbind(vm, vma, op->q, op->syncs,
> > > > > >                                    op->num_syncs, op->flags &
> > > > > > XE_VMA_OP_FIRST,
> > > > > > -                                  op->flags &
> > > > > > XE_VMA_OP_LAST);
> > > > > > +                                  op->flags &
> > > > > > XE_VMA_OP_LAST,
> > > > > > +                                  op->flags &
> > > > > > XE_VMA_OP_ASYNC);
> > > > > >                 break;
> > > > > >         case DRM_GPUVA_OP_PREFETCH:
> > > > > >                 err = xe_vm_prefetch(vm, vma, op->q, op-
> > > > > > >prefetch.region,
> > > > > >                                      op->syncs, op-
> > > > > > >num_syncs,
> > > > > >                                      op->flags &
> > > > > > XE_VMA_OP_FIRST,
> > > > > > -                                    op->flags &
> > > > > > XE_VMA_OP_LAST);
> > > > > > +                                    op->flags &
> > > > > > XE_VMA_OP_LAST,
> > > > > > +                                    op->flags &
> > > > > > XE_VMA_OP_ASYNC);
> > > > > >                 break;
> > > > > >         default:
> > > > > >                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
> > > > > > @@ -2808,16 +2805,16 @@ static int
> > > > > > vm_bind_ioctl_ops_execute(struct xe_vm *vm,
> > > > > >    #ifdef TEST_VM_ASYNC_OPS_ERROR
> > > > > >    #define SUPPORTED_FLAGS      \
> > > > > > -       (FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
> > > > > > -        DRM_XE_VM_BIND_FLAG_READONLY |
> > > > > > DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
> > > > > > -        DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> > > > > > +       (FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY
> > > > > > | \
> > > > > > +        DRM_XE_VM_BIND_FLAG_IMMEDIATE |
> > > > > > DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
> > > > > >    #else
> > > > > >    #define SUPPORTED_FLAGS      \
> > > > > > -       (DRM_XE_VM_BIND_FLAG_ASYNC |
> > > > > > DRM_XE_VM_BIND_FLAG_READONLY | \
> > > > > > +       (DRM_XE_VM_BIND_FLAG_READONLY | \
> > > > > >          DRM_XE_VM_BIND_FLAG_IMMEDIATE |
> > > > > > DRM_XE_VM_BIND_FLAG_NULL | \
> > > > > >          0xffff)
> > > > > >    #endif
> > > > > >    #define XE_64K_PAGE_MASK 0xffffull
> > > > > > +#define ALL_DRM_XE_SYNCS_FLAGS
> > > > > > (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
> > > > > >    #define MAX_BINDS    512     /* FIXME: Picking random
> > > > > > upper limit */
> > > > > > @@ -2829,7 +2826,7 @@ static int
> > > > > > vm_bind_ioctl_check_args(struct xe_device *xe,
> > > > > >         int err;
> > > > > >         int i;
> > > > > > -       if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > > > > > +       if (XE_IOCTL_DBG(xe, args->pad) ||
> > > > > >             XE_IOCTL_DBG(xe, args->reserved[0] || args-
> > > > > > >reserved[1]))
> > > > > >                 return -EINVAL;
> > > > > > @@ -2857,6 +2854,14 @@ static int
> > > > > > vm_bind_ioctl_check_args(struct xe_device *xe,
> > > > > >                 *bind_ops = &args->bind;
> > > > > >         }
> > > > > > +       *async = !(args->syncs.flags &
> > > > > > DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
> > > > > > +
> > > > > > +       if (XE_IOCTL_DBG(xe, args->syncs.flags &
> > > > > > ~ALL_DRM_XE_SYNCS_FLAGS) ||
> > > > > > +           XE_IOCTL_DBG(xe, !*async && args-
> > > > > > >syncs.num_syncs)) {
> > > > > > +               err = -EINVAL;
> > > > > > +               goto free_bind_ops;
> > > > > > +       }
> > > > > > +
> > > > > >         for (i = 0; i < args->num_binds; ++i) {
> > > > > >                 u64 range = (*bind_ops)[i].range;
> > > > > >                 u64 addr = (*bind_ops)[i].addr;
> > > > > > @@ -2887,18 +2892,6 @@ static int
> > > > > > vm_bind_ioctl_check_args(struct xe_device *xe,
> > > > > >                         goto free_bind_ops;
> > > > > >                 }
> > > > > > -               if (i == 0) {
> > > > > > -                       *async = !!(flags &
> > > > > > DRM_XE_VM_BIND_FLAG_ASYNC);
> > > > > > -                       if (XE_IOCTL_DBG(xe, !*async && args-
> > > > > > >num_syncs)) {
> > > > > > -                               err = -EINVAL;
> > > > > > -                               goto free_bind_ops;
> > > > > > -                       }
> > > > > > -               } else if (XE_IOCTL_DBG(xe, *async !=
> > > > > > -                                       !!(flags &
> > > > > > DRM_XE_VM_BIND_FLAG_ASYNC))) {
> > > > > > -                       err = -EINVAL;
> > > > > > -                       goto free_bind_ops;
> > > > > > -               }
> > > > > > -
> > > > > >                 if (XE_IOCTL_DBG(xe, op >
> > > > > > DRM_XE_VM_BIND_OP_PREFETCH) ||
> > > > > >                     XE_IOCTL_DBG(xe, flags &
> > > > > > ~SUPPORTED_FLAGS) ||
> > > > > >                     XE_IOCTL_DBG(xe, obj && is_null) ||
> > > > > > @@ -2951,7 +2944,7 @@ static int
> > > > > > vm_bind_ioctl_check_args(struct xe_device *xe,
> > > > > >    static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> > > > > >                                        struct xe_exec_queue
> > > > > > *q,
> > > > > >                                        struct xe_sync_entry
> > > > > > *syncs,
> > > > > > -                                      int num_syncs)
> > > > > > +                                      int num_syncs, bool
> > > > > > async)
> > > > > >    {
> > > > > >         struct dma_fence *fence;
> > > > > >         int i, err = 0;
> > > > > > @@ -2967,7 +2960,7 @@ static int
> > > > > > vm_bind_ioctl_signal_fences(struct xe_vm *vm,
> > > > > >         xe_exec_queue_last_fence_set(to_wait_exec_queue(vm,
> > > > > > q), vm,
> > > > > >                                      fence);
> > > > > > -       if (xe_vm_sync_mode(vm, q)) {
> > > > > > +       if (!async) {
> > > > > >                 long timeout = dma_fence_wait(fence, true);
> > > > > >                 if (timeout < 0)
> > > > > > @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device
> > > > > > *dev, void *data, struct drm_file *file)
> > > > > >         if (err)
> > > > > >                 return err;
> > > > > > -       if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
> > > > > > +       if (XE_IOCTL_DBG(xe, args->pad) ||
> > > > > >             XE_IOCTL_DBG(xe, args->reserved[0] || args-
> > > > > > >reserved[1]))
> > > > > >                 return -EINVAL;
> > > > > > @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device
> > > > > > *dev, void *data, struct drm_file *file)
> > > > > >                         err = -EINVAL;
> > > > > >                         goto put_exec_queue;
> > > > > >                 }
> > > > > > -
> > > > > > -               if (XE_IOCTL_DBG(xe, args->num_binds && async
> > > > > > !=
> > > > > > -                                !!(q->flags &
> > > > > > EXEC_QUEUE_FLAG_VM_ASYNC))) {
> > > > > > -                       err = -EINVAL;
> > > > > > -                       goto put_exec_queue;
> > > > > > -               }
> > > > > >         }
> > > > > >         vm = xe_vm_lookup(xef, args->vm_id);
> > > > > > @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device
> > > > > > *dev, void *data, struct drm_file *file)
> > > > > >                 goto put_exec_queue;
> > > > > >         }
> > > > > > -       if (!args->exec_queue_id) {
> > > > > > -               if (XE_IOCTL_DBG(xe, args->num_binds && async
> > > > > > !=
> > > > > > -                                !!(vm->flags &
> > > > > > XE_VM_FLAG_ASYNC_DEFAULT))) {
> > > > > > -                       err = -EINVAL;
> > > > > > -                       goto put_vm;
> > > > > > -               }
> > > > > > -       }
> > > > > > -
> > > > > >         err = down_write_killable(&vm->lock);
> > > > > >         if (err)
> > > > > >                 goto put_vm;
> > > > > > @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct
> > > > > > drm_device *dev, void *data, struct drm_file *file)
> > > > > >                 }
> > > > > >         }
> > > > > > -       if (args->num_syncs) {
> > > > > > -               syncs = kcalloc(args->num_syncs,
> > > > > > sizeof(*syncs), GFP_KERNEL);
> > > > > > +       if (args->syncs.num_syncs) {
> > > > > > +               syncs = kcalloc(args->syncs.num_syncs,
> > > > > > sizeof(*syncs), GFP_KERNEL);
> > > > > >                 if (!syncs) {
> > > > > >                         err = -ENOMEM;
> > > > > >                         goto put_obj;
> > > > > >                 }
> > > > > >         }
> > > > > > -       syncs_user = u64_to_user_ptr(args->syncs);
> > > > > > -       for (num_syncs = 0; num_syncs < args->num_syncs;
> > > > > > num_syncs++) {
> > > > > > +       syncs_user = u64_to_user_ptr(args->syncs.syncs);
> > > > > > +       for (num_syncs = 0; num_syncs < args-
> > > > > > >syncs.num_syncs; num_syncs++) {
> > > > > >                 err = xe_sync_entry_parse(xe, xef,
> > > > > > &syncs[num_syncs],
> > > > > >                                          
> > > > > > &syncs_user[num_syncs],
> > > > > >                                          
> > > > > > (xe_vm_in_lr_mode(vm) ?
> > > > > > @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device
> > > > > > *dev, void *data, struct drm_file *file)
> > > > > >         vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
> > > > > >    free_syncs:
> > > > > >         if (err == -ENODATA)
> > > > > > -               err = vm_bind_ioctl_signal_fences(vm, q,
> > > > > > syncs, num_syncs);
> > > > > > +               err = vm_bind_ioctl_signal_fences(vm, q,
> > > > > > syncs, num_syncs,
> > > > > > +                                                 async);
> > > > > >         while (num_syncs--)
> > > > > >                 xe_sync_entry_cleanup(&syncs[num_syncs]);
> > > > > > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h
> > > > > > b/drivers/gpu/drm/xe/xe_vm_types.h
> > > > > > index 23abdfd8622f..ce8b9bde7e9c 100644
> > > > > > --- a/drivers/gpu/drm/xe/xe_vm_types.h
> > > > > > +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> > > > > > @@ -167,13 +167,12 @@ struct xe_vm {
> > > > > >          */
> > > > > >    #define XE_VM_FLAG_64K                       BIT(0)
> > > > > >    #define XE_VM_FLAG_LR_MODE           BIT(1)
> > > > > > -#define XE_VM_FLAG_ASYNC_DEFAULT       BIT(2)
> > > > > > -#define XE_VM_FLAG_MIGRATION           BIT(3)
> > > > > > -#define XE_VM_FLAG_SCRATCH_PAGE                BIT(4)
> > > > > > -#define XE_VM_FLAG_FAULT_MODE          BIT(5)
> > > > > > -#define XE_VM_FLAG_BANNED              BIT(6)
> > > > > > -#define XE_VM_FLAG_TILE_ID(flags)      FIELD_GET(GENMASK(8,
> > > > > > 7), flags)
> > > > > > -#define XE_VM_FLAG_SET_TILE_ID(tile)   FIELD_PREP(GENMASK(8,
> > > > > > 7), (tile)->id)
> > > > > > +#define XE_VM_FLAG_MIGRATION           BIT(2)
> > > > > > +#define XE_VM_FLAG_SCRATCH_PAGE                BIT(3)
> > > > > > +#define XE_VM_FLAG_FAULT_MODE          BIT(4)
> > > > > > +#define XE_VM_FLAG_BANNED              BIT(5)
> > > > > > +#define XE_VM_FLAG_TILE_ID(flags)      FIELD_GET(GENMASK(7,
> > > > > > 6), flags)
> > > > > > +#define XE_VM_FLAG_SET_TILE_ID(tile)   FIELD_PREP(GENMASK(7,
> > > > > > 6), (tile)->id)
> > > > > >         unsigned long flags;
> > > > > >         /** @composite_fence_ctx: context composite fence */
> > > > > > @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
> > > > > >         XE_VMA_OP_PREV_COMMITTED        = BIT(3),
> > > > > >         /** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation
> > > > > > committed */
> > > > > >         XE_VMA_OP_NEXT_COMMITTED        = BIT(4),
> > > > > > +       /** @XE_VMA_OP_ASYNC: operation is async */
> > > > > > +       XE_VMA_OP_ASYNC                 = BIT(5),
> > > > > >    };
> > > > > >    /** struct xe_vma_op - VMA operation */
> > > > > > diff --git a/include/uapi/drm/xe_drm.h
> > > > > > b/include/uapi/drm/xe_drm.h
> > > > > > index eb03a49c17a1..fd8172fe2d9a 100644
> > > > > > --- a/include/uapi/drm/xe_drm.h
> > > > > > +++ b/include/uapi/drm/xe_drm.h
> > > > > > @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
> > > > > >          * Kernel only classes (not actual hardware engine
> > > > > > class). Used for
> > > > > >          * creating ordered queues of VM bind operations.
> > > > > >          */
> > > > > > -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC      5
> > > > > > -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC       6
> > > > > > +#define DRM_XE_ENGINE_CLASS_VM_BIND            5
> > > > > >         __u16 engine_class;
> > > > > >         __u16 engine_instance;
> > > > > > @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
> > > > > >          * still enable recoverable pagefaults if supported
> > > > > > by the device.
> > > > > >          */
> > > > > >    #define DRM_XE_VM_CREATE_FLAG_LR_MODE                (1 <<
> > > > > > 1)
> > > > > > -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT    (1 << 2)
> > > > > >         /*
> > > > > >          * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
> > > > > >          * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to
> > > > > > be allocated
> > > > > > @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
> > > > > >          * The xe driver internally uses recoverable
> > > > > > pagefaults to implement
> > > > > >          * this.
> > > > > >          */
> > > > > > -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE       (1 << 3)
> > > > > > +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE       (1 << 2)
> > > > > >         /** @flags: Flags */
> > > > > >         __u32 flags;
> > > > > > @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
> > > > > >         __u32 op;
> > > > > >    #define DRM_XE_VM_BIND_FLAG_READONLY (1 << 0)
> > > > > > -#define DRM_XE_VM_BIND_FLAG_ASYNC      (1 << 1)
> > > > > >         /*
> > > > > >          * Valid on a faulting VM only, do the MAP operation
> > > > > > immediately rather
> > > > > >          * than deferring the MAP to the page fault handler.
> > > > > >          */
> > > > > > -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE  (1 << 2)
> > > > > > +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE  (1 << 1)
> > > > > >         /*
> > > > > >          * When the NULL flag is set, the page tables are
> > > > > > setup with a special
> > > > > >          * bit which indicates writes are dropped and all
> > > > > > reads return zero.  In
> > > > > > @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
> > > > > >          * operations, the BO handle MBZ, and the BO offset
> > > > > > MBZ. This flag is
> > > > > >          * intended to implement VK sparse bindings.
> > > > > >          */
> > > > > > -#define DRM_XE_VM_BIND_FLAG_NULL       (1 << 3)
> > > > > > +#define DRM_XE_VM_BIND_FLAG_NULL       (1 << 2)
> > > > > >         /** @flags: Bind flags */
> > > > > >         __u32 flags;
> > > > > > @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
> > > > > >         __u64 reserved[3];
> > > > > >    };
> > > > > > +/**
> > > > > > + * struct drm_xe_syncs - In / out syncs for IOCTLs.
> > > > > > + */
> > > > > > +struct drm_xe_syncs {
> > > > > > +       /** @num_syncs: amount of syncs to wait on */
> > > > > > +       __u32 num_syncs;
> > > > > > +
> > > > > > +       /*
> > > > > > +        * Block in IOCTL until operation complete, num_syncs
> > > > > > MBZ if set.
> > > > > > +        */
> > > > > > +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
> > > > > > +       /** @flags: Sync flags */
> > > > > > +       __u32 flags;
> > > > > > +
> > > > > > +       /** @syncs: pointer to struct drm_xe_sync array */
> > > > > > +       __u64 syncs;
> > > > > > +
> > > > > > +       /** @reserved: Reserved */
> > > > > > +       __u64 reserved[2];
> > > > > > +};
> > > > > > +
> > > > > >    struct drm_xe_vm_bind {
> > > > > >         /** @extensions: Pointer to the first extension
> > > > > > struct, if any */
> > > > > >         __u64 extensions;
> > > > > > @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
> > > > > >                 __u64 vector_of_binds;
> > > > > >         };
> > > > > > -       /** @pad: MBZ */
> > > > > > -       __u32 pad2;
> > > > > > -
> > > > > > -       /** @num_syncs: amount of syncs to wait on */
> > > > > > -       __u32 num_syncs;
> > > > > > -
> > > > > > -       /** @syncs: pointer to struct drm_xe_sync array */
> > > > > > -       __u64 syncs;
> > > > > > +       /** @syncs: syncs for bind */
> > > > > > +       struct drm_xe_syncs syncs;
> > > > > >         /** @reserved: Reserved */
> > > > > >         __u64 reserved[2];
> > > > > > @@ -974,14 +986,14 @@ struct drm_xe_exec {
> > > > > >         /** @extensions: Pointer to the first extension
> > > > > > struct, if any */
> > > > > >         __u64 extensions;
> > > > > > +       /** @pad: MBZ */
> > > > > > +       __u32 pad;
> > > > > > +
> > > > > >         /** @exec_queue_id: Exec queue ID for the batch
> > > > > > buffer */
> > > > > >         __u32 exec_queue_id;
> > > > > > -       /** @num_syncs: Amount of struct drm_xe_sync in
> > > > > > array. */
> > > > > > -       __u32 num_syncs;
> > > > > > -
> > > > > > -       /** @syncs: Pointer to struct drm_xe_sync array. */
> > > > > > -       __u64 syncs;
> > > > > > +       /** @syncs: syncs for exec */
> > > > > > +       struct drm_xe_syncs syncs;
> > > > > >         /**
> > > > > >          * @address: address of batch buffer if
> > > > > > num_batch_buffer == 1 or an
> > > > > > @@ -995,8 +1007,8 @@ struct drm_xe_exec {
> > > > > >          */
> > > > > >         __u16 num_batch_buffer;
> > > > > > -       /** @pad: MBZ */
> > > > > > -       __u16 pad[3];
> > > > > > +       /** @pad2: MBZ */
> > > > > > +       __u16 pad2[3];
> > > > > >         /** @reserved: Reserved */
> > > > > >         __u64 reserved[2];
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling
  2023-12-11 21:11             ` Matthew Brost
@ 2023-12-12  8:43               ` Thomas Hellström
  0 siblings, 0 replies; 22+ messages in thread
From: Thomas Hellström @ 2023-12-12  8:43 UTC (permalink / raw)
  To: Matthew Brost; +Cc: Francois Dugast, intel-xe, Rodrigo Vivi


On 12/11/23 22:11, Matthew Brost wrote:
> On Mon, Dec 11, 2023 at 07:11:15PM +0100, Thomas Hellström wrote:
>> On Mon, 2023-12-11 at 16:49 +0000, Matthew Brost wrote:
>>> On Mon, Dec 11, 2023 at 04:43:06PM +0100, Thomas Hellström wrote:
>>>> On 12/8/23 10:45, Matthew Brost wrote:
>>>>> On Fri, Dec 08, 2023 at 04:00:37PM +0100, Thomas Hellström wrote:
>>>>>> On 12/7/23 06:57, Matthew Brost wrote:
>>>>>>> Remove concept of async vs sync VM bind queues, rather make
>>>>>>> async vs
>>>>>>> sync a per IOCTL choice. Since this is per IOCTL, it makes
>>>>>>> sense to have
>>>>>>> a singular flag IOCTL rather than per VM bind op flag too.
>>>>>>> Add
>>>>>>> DRM_XE_SYNCS_FLAG_WAIT_FOR_OP which is an input sync flag to
>>>>>>> support
>>>>>>> this. Support this new flag for both the VM bind IOCTL and
>>>>>>> the exec
>>>>>>> IOCTL to match behavior.
>>>>>>>
>>>>>>> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
>>>>>>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>>>>>> Cc: Francois Dugast <francois.dugast@intel.com>
>>>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>>>> ---
>>>>>>>     drivers/gpu/drm/xe/xe_exec.c             |  58 ++++++++---
>>>>>>> -
>>>>>>>     drivers/gpu/drm/xe/xe_exec_queue.c       |   7 +-
>>>>>>>     drivers/gpu/drm/xe/xe_exec_queue_types.h |   2 -
>>>>>>>     drivers/gpu/drm/xe/xe_vm.c               | 110 ++++++++++-
>>>>>>> ------------
>>>>>>>     drivers/gpu/drm/xe/xe_vm_types.h         |  15 ++--
>>>>>>>     include/uapi/drm/xe_drm.h                |  56 +++++++----
>>>>>>> -
>>>>>>>     6 files changed, 129 insertions(+), 119 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
>>>>>>> index 92b0da6580e8..c62cabfaa112 100644
>>>>>>> --- a/drivers/gpu/drm/xe/xe_exec.c
>>>>>>> +++ b/drivers/gpu/drm/xe/xe_exec.c
>>>>>>> @@ -130,12 +130,15 @@ static int xe_exec_begin(struct drm_exec *exec, struct xe_vm *vm)
>>>>>>>          return err;
>>>>>>>     }
>>>>>>> +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
>>>>>>> +
>>>>>>>     int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>>>>>     {
>>>>>>>          struct xe_device *xe = to_xe_device(dev);
>>>>>>>          struct xe_file *xef = to_xe_file(file);
>>>>>>>          struct drm_xe_exec *args = data;
>>>>>>> -       struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
>>>>>>> +       struct drm_xe_sync __user *syncs_user =
>>>>>>> +               u64_to_user_ptr(args->syncs.syncs);
>>>>>>>          u64 __user *addresses_user = u64_to_user_ptr(args->address);
>>>>>>>          struct xe_exec_queue *q;
>>>>>>>          struct xe_sync_entry *syncs = NULL;
>>>>>>> @@ -143,15 +146,18 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>>>>>          struct drm_exec exec;
>>>>>>>          u32 i, num_syncs = 0;
>>>>>>>          struct xe_sched_job *job;
>>>>>>> -       struct dma_fence *rebind_fence;
>>>>>>> +       struct dma_fence *rebind_fence, *job_fence;
>>>>>>>          struct xe_vm *vm;
>>>>>>> -       bool write_locked;
>>>>>>> +       bool write_locked, skip_job_put = false;
>>>>>>> +       bool wait = args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP;
>>>>>>>          ktime_t end = 0;
>>>>>>>          int err = 0;
>>>>>>>          if (XE_IOCTL_DBG(xe, args->extensions) ||
>>>>>>> -           XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
>>>>>>> -           XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>>>>>>> +           XE_IOCTL_DBG(xe, args->pad || args->pad2[0] || args->pad2[1] || args->pad2[2]) ||
>>>>>>> +           XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
>>>>>>> +           XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
>>>>>>> +           XE_IOCTL_DBG(xe, wait && args->syncs.num_syncs))
>>>>>>>                  return -EINVAL;
>>>>>>>          q = xe_exec_queue_lookup(xef, args->exec_queue_id);
>>>>>>> @@ -170,8 +176,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>>>>>                  goto err_exec_queue;
>>>>>>>          }
>>>>>>> -       if (args->num_syncs) {
>>>>>>> -               syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
>>>>>>> +       if (args->syncs.num_syncs) {
>>>>>>> +               syncs = kcalloc(args->syncs.num_syncs, sizeof(*syncs),
>>>>>>> +                               GFP_KERNEL);
>>>>>>>                  if (!syncs) {
>>>>>>>                          err = -ENOMEM;
>>>>>>>                          goto err_exec_queue;
>>>>>>> @@ -180,7 +187,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>>>>>          vm = q->vm;
>>>>>>> -       for (i = 0; i < args->num_syncs; i++) {
>>>>>>> +       for (i = 0; i < args->syncs.num_syncs; i++) {
>>>>>>>                  err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
>>>>>>>                                            &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
>>>>>>>                                            (xe_vm_in_lr_mode(vm) ?
>>>>>>> @@ -245,9 +252,17 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>>>>>                                  err = PTR_ERR(fence);
>>>>>>>                                  goto err_exec;
>>>>>>>                          }
>>>>>>> +
>>>>>>>                          for (i = 0; i < num_syncs; i++)
>>>>>>>                                  xe_sync_entry_signal(&syncs[i], NULL, fence);
>>>>>>> +
>>>>>>>                          xe_exec_queue_last_fence_set(q, vm, fence);
>>>>>>> +                       if (wait) {
>>>>>>> +                               long timeout = dma_fence_wait(fence, true);
>>>>>>> +
>>>>>>> +                               if (timeout < 0)
>>>>>>> +                                       err = -EINTR;
>>>>>>> +                       }
>>>>>> Here it looks like we will rerun the same IOCTL if we return -EINTR.
>>>>>> The expected user-space action on -EINTR is to just restart the
>>>>>> IOCTL without any argument changes. The solution is to add an ioctl
>>>>>> argument cookie (or to skip sync vm binds and have the user just use
>>>>>> the zero batch-buffer / zero-bind calls, or wait for an out-fence).
>>>>>> If you go for the cookie solution then IMO we should keep the
>>>>>> -ERESTARTSYS returned from dma_fence_wait() since it's converted to
>>>>>> -EINTR on return-to-user-space, and the kernel restarts the IOCTL
>>>>>> automatically if there was no requested-for-delivery signal pending.
>>>>>>
>>>>>> I think the simplest solution at this point is to skip the sync
>>>>>> behaviour, in particular if we enable the zero batch / zero bind
>>>>>> possibility.
>>>>>>
>>>>>> If we still want to provide it, we could add a cookie address as an
>>>>>> extension to the ioctl and activate sync if present? (Just throwing
>>>>>> out ideas here.)
>>>>>>
>>>>> Hmm, forgot about this. A cookie is fairly easy; what about
>>>>> something like this:
>>>>>
>>>>> /**
>>>>>  * struct drm_xe_syncs - In / out syncs for IOCTLs.
>>>>>  */
>>>>> struct drm_xe_syncs {
>>>>>         /** @num_syncs: amount of syncs to wait on */
>>>>>         __u32 num_syncs;
>>>>>
>>>>>         /*
>>>>>          * Block in IOCTL until operation complete, num_syncs MBZ if set.
>>>>>          */
>>>>> #define DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP (1 << 0)
>>>>>         /** @in_flags: Input Sync flags */
>>>>>         __u16 in_flags;
>>>>>
>>>>>         /*
>>>>>          * IOCTL operation has started (no need for user to resubmit on
>>>>>          * -ERESTARTSYS)
>>>>>          */
>>>>> #define DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED (1 << 0)
>>>>>         /** @out_flags: Output Sync flags */
>>>>>         __u16 out_flags;
>>>>>
>>>>>         /** @syncs: pointer to struct drm_xe_sync array */
>>>>>         __u64 syncs;
>>>>>
>>>>>         /** @reserved: Reserved */
>>>>>         __u64 reserved[2];
>>>>> };
>>>>>
>>>>> DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED gets set in the exec / bind IOCTL
>>>>> after the job is committed or, in the case of zero ops, after the
>>>>> last fence is updated on the queue. Note that for binds we don't yet
>>>>> do one job per IOCTL, but we will after landing some version of [1].
>>>>>
>>>>> After DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED is set we return -ERESTARTSYS
>>>>> if the wait is interrupted, and -EINTR if
>>>>> DRM_XE_SYNCS_OUT_FLAG_OP_COMMITTED is still unset (interrupted before
>>>>> the job is committed).
>>>>>
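>>>>> Roughly, the expected user-space loop would look like this (just a
>>>>> sketch; xe_vm_bind_sync() is a made-up helper, error handling is
>>>>> elided, and it needs <sys/ioctl.h>, <errno.h> and the new xe_drm.h):
>>>>>
>>>>> int xe_vm_bind_sync(int fd, struct drm_xe_vm_bind *args)
>>>>> {
>>>>>         int ret;
>>>>>
>>>>>         args->syncs.in_flags = DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP;
>>>>>         args->syncs.out_flags = 0;      /* kernel sets OP_COMMITTED */
>>>>>
>>>>>         do {
>>>>>                 /* Restart with unchanged args on -EINTR; once
>>>>>                  * OP_COMMITTED has been written back the kernel
>>>>>                  * only waits, it does not resubmit the ops.
>>>>>                  */
>>>>>                 ret = ioctl(fd, DRM_IOCTL_XE_VM_BIND, args);
>>>>>         } while (ret == -1 && errno == EINTR);
>>>>>
>>>>>         return ret;
>>>>> }
>>>>>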
>>>>> I'd rather go with this patch, as we have to change the uAPI here
>>>>> regardless, so we might as well make it complete.
>>>>>
>>>>> Matt
>>>>>
>>>>> [1] https://patchwork.freedesktop.org/series/125608/
>>>> Yeah, as we discussed in the meeting, that means making the ioctl RW
>>>> instead of W, with some copying overhead.
>>>>
>>>> I also think we should leave the EXEC ioctl out of this, meaning just
>>>> having a single field in the VM_BIND ioctl. Basically the reason is
>>>> that waiting like this after submission is a bit weird and does not
>>>> align well with how -EINTR is typically used.
>>>>
>>> I kinda like uniform behavior between exec and binds, with that
>>> behavior defined in a common sync structure.
>> Even so, I strongly think we should *not* in any way expose this for
>> exec. If needed, the user can just wait for an out-fence, and then we
>> don't need to implement code for this that will probably never get used
>> and that very few will understand.
>>
>> Furthermore, the sync VM_BIND ioctl per the ASYNC VM_BIND doc allows
>> neither in-fences nor out-fences, so grouping like this becomes a bit
>> overkill.
>>
>>>> So either a pointer to a cookie in the ioctl,
>>>>
>>> What about:
>>>
>>> /**
>>>  * struct drm_xe_syncs - In / out syncs for IOCTLs.
>>>  */
>>> struct drm_xe_syncs {
>>>         /** @num_syncs: amount of syncs to wait on */
>>>         __u32 num_syncs;
>>>
>>>         /*
>>>          * Block in IOCTL until operation complete, num_syncs MBZ if set.
>>>          */
>>> #define DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP (1 << 0)
>>>         /** @flags: Sync flags */
>>>         __u32 in_flags;
>>>
>>>         /** @cookie: userptr cookie written back with non-zero value
>>>          * once operation committed, only valid when IOCTL returns
>>>          * -EINTR */
>>>         __u64 cookie;
>>>
>>>         /** @syncs: pointer to struct drm_xe_sync array */
>>>         __u64 syncs;
>>>
>>>         /** @reserved: Reserved */
>>>         __u64 reserved[2];
>>> };
>>>
>>> Also, if cookie is 0, we wait uninterruptibly once the op is
>>> committed?
>> I'm afraid I don't follow. The *interruptible* wait after commit is
>> what triggers the need for a cookie in the first place? Also here,
>> @cookie is still read-only for the kernel since the struct drm_xe_syncs
>> is embedded in the ioctl. Also I think any cookie should be opaque to
>> the user, other than that it must be 0 if not calling after an
>> -ERESTART.
>>
> Cookie here is a user address which is written back to when a sync wait
> is interrupted. The expected value of *cookie is zero on IOCTL
> submission. If cookie == NULL, the sync wait would be uninterruptible
> (we can skip this part if it's confusing). The kernel only writes
> *cookie when a sync wait is interrupted; the written value is defined
> simply as non-zero.
>
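> To make that concrete, a minimal user-space sketch under this proposal
> (names from the struct above; the exact write-back semantics are still
> up for discussion):
>
> __u64 cookie = 0;       /* must be 0 on first submission */
>
> bind.syncs.in_flags = DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP;
> bind.syncs.cookie = (__u64)(uintptr_t)&cookie;
>
> do {
>         /* On -EINTR the kernel has written a non-zero value to
>          * *cookie iff the ops were already committed; restarting
>          * with unchanged args then only waits for completion.
>          */
>         ret = ioctl(fd, DRM_IOCTL_XE_VM_BIND, &bind);
> } while (ret == -1 && errno == EINTR);
>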
>>>> or perhaps dig up again the idea we had of mostly waiting before
>>>> the
>>>> submission:
>>>>
>>>> 1) Pull out the last_op fence for the queue from under the relevant
>>>> lock.
>>>> 2) Wait for all dependencies without any locks.
>>>> 3) Lock, and (optionally) if the last_op fence changed, wait for
>>>> it.
>>>> 4) Submit
>>>> 5) Wait for completion uninterruptible.
>>>>
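>>>> In kernel pseudocode that would be roughly (helper names here are
>>>> made up, only dma_fence_wait() is real):
>>>>
>>>> fence = queue_last_fence_get(q);          /* 1) under the queue lock */
>>>> err = wait_job_deps(job);                 /* 2) interruptible, no
>>>>                                            *    locks held */
>>>> if (err)
>>>>         return err;                       /* nothing committed yet */
>>>> lock(q);
>>>> if (queue_last_fence(q) != fence)         /* 3) raced with another op */
>>>>         err = dma_fence_wait(queue_last_fence(q), true);
>>>> if (!err)
>>>>         submit(job);                      /* 4) */
>>>> unlock(q);
>>>> if (!err)
>>>>         dma_fence_wait(job_fence, false); /* 5) uninterruptible,
>>>>                                            *    expected to be short */
>>>>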
>>> We can always change the internal implementation to something like
>>> this
>>> after [1]. That series makes refactors like this quite a bit easier.
>> Well, the idea of the above 1) - 5) was that we wouldn't be needing any
>> cookie at all, since the wait in 5) would be short, and we could
>> therefore get away with implementing it uninterruptibly. If that turned
>> out to be bad, we add the cookie as an extension. The initial
>> implementation can even use uninterruptible waits for simplicity.
>>
>> To summarize:
>>
>> * I strongly don't think we should support sync exec calls.
> What about the same interface as defined above, but on exec, if
> DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP is set, we return -EOPNOTSUPP? This
> gives us a uniform interface between bind and exec with an optional path
> to support sync execs in the future if a UMD asks for it.
>
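> Kernel side that would just be something like (sketch only):
>
>         if (XE_IOCTL_DBG(xe, args->syncs.in_flags &
>                          DRM_XE_SYNCS_IN_FLAG_WAIT_FOR_OP))
>                 return -EOPNOTSUPP;
>
> in xe_exec_ioctl(), until someone actually needs sync execs.
>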
>> * No in-syncs or out-syncs if SYNC.
> Agree.
>
>> * A flag to trigger sync binds. Syncs could be in a separate struct,
>> but not really needed if we don't support sync execs.
> See above, the interface I'm proposing has this.
>
>> * If we go for the interruptible wait, we need a writable cookie that
>> is not embedded in the main struct.
> See above, the interface I'm proposing has this.

Hi, Matt.

Ack on this. Let's code an xe_drm.h up and run it by the UMD people. 
(I'd rather skip the sync functionality completely for symmetry, but IMO 
this will work).

/Thomas

>
> Matt
>
>>
>> /Thomas
>>
>>
>>
>>> Matt
>>>
>>> [1] https://patchwork.freedesktop.org/series/125608/
>>>
>>>> I actually like this last one best, but we'd recommend UMD to uses
>>>> out-fences whenever possible.
>>>>
>>>> Thoughts?
>>>>
>>>>>>>                          dma_fence_put(fence);
>>>>>>>                  }
>>>>>>> @@ -331,42 +346,51 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>>>>>           * the job and let the DRM scheduler / backend clean up the job.
>>>>>>>           */
>>>>>>>          xe_sched_job_arm(job);
>>>>>>> +       job_fence = &job->drm.s_fence->finished;
>>>>>>> +       if (wait)
>>>>>>> +               dma_fence_get(job_fence);
>>>>>>>          if (!xe_vm_in_lr_mode(vm)) {
>>>>>>>                  /* Block userptr invalidations / BO eviction */
>>>>>>> -               dma_resv_add_fence(&vm->resv,
>>>>>>> -                                  &job->drm.s_fence->finished,
>>>>>>> +               dma_resv_add_fence(&vm->resv, job_fence,
>>>>>>>                                     DMA_RESV_USAGE_BOOKKEEP);
>>>>>>>                  /*
>>>>>>>                   * Make implicit sync work across drivers, assuming all external
>>>>>>>                   * BOs are written as we don't pass in a read / write list.
>>>>>>>                   */
>>>>>>> -               xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
>>>>>>> -                                       DMA_RESV_USAGE_WRITE);
>>>>>>> +               xe_vm_fence_all_extobjs(vm, job_fence, DMA_RESV_USAGE_WRITE);
>>>>>>>          }
>>>>>>>          for (i = 0; i < num_syncs; i++)
>>>>>>> -               xe_sync_entry_signal(&syncs[i], job,
>>>>>>> -                                    &job->drm.s_fence->finished);
>>>>>>> +               xe_sync_entry_signal(&syncs[i], job, job_fence);
>>>>>>>          if (xe_exec_queue_is_lr(q))
>>>>>>>                  q->ring_ops->emit_job(job);
>>>>>>>          if (!xe_vm_in_lr_mode(vm))
>>>>>>> -               xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
>>>>>>> +               xe_exec_queue_last_fence_set(q, vm, job_fence);
>>>>>>>          xe_sched_job_push(job);
>>>>>>>          xe_vm_reactivate_rebind(vm);
>>>>>>> -       if (!err && !xe_vm_in_lr_mode(vm)) {
>>>>>>> +       if (!xe_vm_in_lr_mode(vm)) {
>>>>>>>                  spin_lock(&xe->ttm.lru_lock);
>>>>>>>                  ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
>>>>>>>                  spin_unlock(&xe->ttm.lru_lock);
>>>>>>>          }
>>>>>>> +       skip_job_put = true;
>>>>>>> +       if (wait) {
>>>>>>> +               long timeout = dma_fence_wait(job_fence, true);
>>>>>>> +
>>>>>>> +               dma_fence_put(job_fence);
>>>>>>> +               if (timeout < 0)
>>>>>>> +                       err = -EINTR;
>>>>>>> +       }
>>>>>>> +
>>>>>>>     err_repin:
>>>>>>>          if (!xe_vm_in_lr_mode(vm))
>>>>>>>                  up_read(&vm->userptr.notifier_lock);
>>>>>>>     err_put_job:
>>>>>>> -       if (err)
>>>>>>> +       if (err && !skip_job_put)
>>>>>>>                  xe_sched_job_put(job);
>>>>>>>     err_exec:
>>>>>>>          drm_exec_fini(&exec);
>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
>>>>>>> index 3911d14522ee..98776d02d634 100644
>>>>>>> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
>>>>>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
>>>>>>> @@ -625,10 +625,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>>>>>>>          if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
>>>>>>>                  return -EINVAL;
>>>>>>> -       if (eci[0].engine_class >= DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC) {
>>>>>>> -               bool sync = eci[0].engine_class ==
>>>>>>> -                       DRM_XE_ENGINE_CLASS_VM_BIND_SYNC;
>>>>>>> -
>>>>>>> +       if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
>>>>>>>                  for_each_gt(gt, xe, id) {
>>>>>>>                          struct xe_exec_queue *new;
>>>>>>> @@ -654,8 +651,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>>>>>>>                                                     args->width, hwe,
>>>>>>>                                                     EXEC_QUEUE_FLAG_PERSISTENT |
>>>>>>>                                                     EXEC_QUEUE_FLAG_VM |
>>>>>>> -                                                  (sync ? 0 :
>>>>>>> -                                                   EXEC_QUEUE_FLAG_VM_ASYNC) |
>>>>>>>                                                     (id ?
>>>>>>>                                                      EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD :
>>>>>>>                                                      0));
>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>>>>>> index 52f0927d0d9b..c78f6e8b41c4 100644
>>>>>>> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>>>>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>>>>>> @@ -74,8 +74,6 @@ struct xe_exec_queue {
>>>>>>>     #define EXEC_QUEUE_FLAG_VM                   BIT(4)
>>>>>>>     /* child of VM queue for multi-tile VM jobs */
>>>>>>>     #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD    BIT(5)
>>>>>>> -/* VM jobs for this queue are asynchronous */
>>>>>>> -#define EXEC_QUEUE_FLAG_VM_ASYNC               BIT(6)
>>>>>>>          /**
>>>>>>>           * @flags: flags for this exec queue, should statically setup aside from ban
>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
>>>>>>> index cf2eb44a71db..4b0c976c003a 100644
>>>>>>> --- a/drivers/gpu/drm/xe/xe_vm.c
>>>>>>> +++ b/drivers/gpu/drm/xe/xe_vm.c
>>>>>>> @@ -1433,9 +1433,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
>>>>>>>                          struct xe_gt *gt = tile->primary_gt;
>>>>>>>                          struct xe_vm *migrate_vm;
>>>>>>>                          struct xe_exec_queue *q;
>>>>>>> -                       u32 create_flags = EXEC_QUEUE_FLAG_VM |
>>>>>>> -                               ((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
>>>>>>> -                               EXEC_QUEUE_FLAG_VM_ASYNC : 0);
>>>>>>> +                       u32 create_flags = EXEC_QUEUE_FLAG_VM;
>>>>>>>                          if (!vm->pt_root[id])
>>>>>>>                                  continue;
>>>>>>> @@ -1835,16 +1833,10 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
>>>>>>>          return ERR_PTR(err);
>>>>>>>     }
>>>>>>> -static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
>>>>>>> -{
>>>>>>> -       return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
>>>>>>> -               !(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
>>>>>>> -}
>>>>>>> -
>>>>>>>     static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>>>>>>>                          struct xe_exec_queue *q, struct xe_sync_entry *syncs,
>>>>>>>                          u32 num_syncs, bool immediate, bool first_op,
>>>>>>> -                       bool last_op)
>>>>>>> +                       bool last_op, bool async)
>>>>>>>     {
>>>>>>>          struct dma_fence *fence;
>>>>>>>          struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>>>>>>> @@ -1870,7 +1862,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>>>>>>>          if (last_op)
>>>>>>>                  xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
>>>>>>> -       if (last_op && xe_vm_sync_mode(vm, q))
>>>>>>> +       if (last_op && !async)
>>>>>>>                  dma_fence_wait(fence, true);
>>>>>>>          dma_fence_put(fence);
>>>>>>> @@ -1880,7 +1872,7 @@ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
>>>>>>>     static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
>>>>>>>                        struct xe_bo *bo, struct xe_sync_entry *syncs,
>>>>>>>                        u32 num_syncs, bool immediate, bool first_op,
>>>>>>> -                     bool last_op)
>>>>>>> +                     bool last_op, bool async)
>>>>>>>     {
>>>>>>>          int err;
>>>>>>> @@ -1894,12 +1886,12 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue
>>>>>>>          }
>>>>>>>          return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
>>>>>>> -                           last_op);
>>>>>>> +                           last_op, async);
>>>>>>>     }
>>>>>>>     static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>>>>>>>                          struct xe_exec_queue *q, struct xe_sync_entry *syncs,
>>>>>>> -                       u32 num_syncs, bool first_op, bool last_op)
>>>>>>> +                       u32 num_syncs, bool first_op, bool last_op, bool async)
>>>>>>>     {
>>>>>>>          struct dma_fence *fence;
>>>>>>>          struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>>>>>>> @@ -1914,7 +1906,7 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>>>>>>>          xe_vma_destroy(vma, fence);
>>>>>>>          if (last_op)
>>>>>>>                  xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
>>>>>>> -       if (last_op && xe_vm_sync_mode(vm, q))
>>>>>>> +       if (last_op && !async)
>>>>>>>                  dma_fence_wait(fence, true);
>>>>>> It looks like we're dropping the error return code here.
>>>>>>
>>>>>>
>>>>>>>          dma_fence_put(fence);
>>>>>>> @@ -1923,7 +1915,6 @@ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
>>>>>>>     #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
>>>>>>>                                      DRM_XE_VM_CREATE_FLAG_LR_MODE | \
>>>>>>> -                                    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
>>>>>>>                                      DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
>>>>>>>     int xe_vm_create_ioctl(struct drm_device *dev, void *data,
>>>>>>> @@ -1977,8 +1968,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
>>>>>>>                  flags |= XE_VM_FLAG_SCRATCH_PAGE;
>>>>>>>          if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
>>>>>>>                  flags |= XE_VM_FLAG_LR_MODE;
>>>>>>> -       if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
>>>>>>> -               flags |= XE_VM_FLAG_ASYNC_DEFAULT;
>>>>>>>          if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
>>>>>>>                  flags |= XE_VM_FLAG_FAULT_MODE;
>>>>>>> @@ -2062,7 +2051,7 @@ static const u32 region_to_mem_type[] = {
>>>>>>>     static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
>>>>>>>                            struct xe_exec_queue *q, u32 region,
>>>>>>>                            struct xe_sync_entry *syncs, u32 num_syncs,
>>>>>>> -                         bool first_op, bool last_op)
>>>>>>> +                         bool first_op, bool last_op, bool async)
>>>>>>>     {
>>>>>>>          struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
>>>>>>>          int err;
>>>>>>> @@ -2077,7 +2066,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
>>>>>>>          if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
>>>>>>>                  return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
>>>>>>> -                                 true, first_op, last_op);
>>>>>>> +                                 true, first_op, last_op, async);
>>>>>>>          } else {
>>>>>>>                  int i;
>>>>>>> @@ -2400,6 +2389,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
>>>>>>>                  }
>>>>>>>                  op->q = q;
>>>>>>> +               if (async)
>>>>>>> +                       op->flags |= XE_VMA_OP_ASYNC;
>>>>>>>                  switch (op->base.op) {
>>>>>>>                  case DRM_GPUVA_OP_MAP:
>>>>>>> @@ -2538,7 +2529,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>>>>>                                   op->syncs, op->num_syncs,
>>>>>>>                                   op->map.immediate || !xe_vm_in_fault_mode(vm),
>>>>>>>                                   op->flags & XE_VMA_OP_FIRST,
>>>>>>> -                                op->flags & XE_VMA_OP_LAST);
>>>>>>> +                                op->flags & XE_VMA_OP_LAST,
>>>>>>> +                                op->flags & XE_VMA_OP_ASYNC);
>>>>>>>                  break;
>>>>>>>          case DRM_GPUVA_OP_REMAP:
>>>>>>>          {
>>>>>>> @@ -2552,7 +2544,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>>>>>                                             op->num_syncs,
>>>>>>>                                             op->flags & XE_VMA_OP_FIRST,
>>>>>>>                                             op->flags & XE_VMA_OP_LAST &&
>>>>>>> -                                          !prev && !next);
>>>>>>> +                                          !prev && !next,
>>>>>>> +                                          op->flags & XE_VMA_OP_ASYNC);
>>>>>>>                          if (err)
>>>>>>>                                  break;
>>>>>>>                          op->remap.unmap_done = true;
>>>>>>> @@ -2563,7 +2556,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>>>>>                          err = xe_vm_bind(vm, op->remap.prev, op->q,
>>>>>>>                                           xe_vma_bo(op->remap.prev), op->syncs,
>>>>>>>                                           op->num_syncs, true, false,
>>>>>>> -                                        op->flags & XE_VMA_OP_LAST && !next);
>>>>>>> +                                        op->flags & XE_VMA_OP_LAST && !next,
>>>>>>> +                                        op->flags & XE_VMA_OP_ASYNC);
>>>>>>>                          op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
>>>>>>>                          if (err)
>>>>>>>                                  break;
>>>>>>> @@ -2576,7 +2570,8 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>>>>>                                           xe_vma_bo(op->remap.next),
>>>>>>>                                           op->syncs, op->num_syncs,
>>>>>>>                                           true, false,
>>>>>>> -                                        op->flags & XE_VMA_OP_LAST);
>>>>>>> +                                        op->flags & XE_VMA_OP_LAST,
>>>>>>> +                                        op->flags & XE_VMA_OP_ASYNC);
>>>>>>>                          op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
>>>>>>>                          if (err)
>>>>>>>                                  break;
>>>>>>> @@ -2588,13 +2583,15 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
>>>>>>>          case DRM_GPUVA_OP_UNMAP:
>>>>>>>                  err = xe_vm_unbind(vm, vma, op->q, op->syncs,
>>>>>>>                                     op->num_syncs, op->flags & XE_VMA_OP_FIRST,
>>>>>>> -                                  op->flags & XE_VMA_OP_LAST);
>>>>>>> +                                  op->flags & XE_VMA_OP_LAST,
>>>>>>> +                                  op->flags & XE_VMA_OP_ASYNC);
>>>>>>>                  break;
>>>>>>>          case DRM_GPUVA_OP_PREFETCH:
>>>>>>>                  err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
>>>>>>>                                       op->syncs, op->num_syncs,
>>>>>>>                                       op->flags & XE_VMA_OP_FIRST,
>>>>>>> -                                    op->flags & XE_VMA_OP_LAST);
>>>>>>> +                                    op->flags & XE_VMA_OP_LAST,
>>>>>>> +                                    op->flags & XE_VMA_OP_ASYNC);
>>>>>>>                  break;
>>>>>>>          default:
>>>>>>>                  drm_warn(&vm->xe->drm, "NOT POSSIBLE");
>>>>>>> @@ -2808,16 +2805,16 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
>>>>>>>     #ifdef TEST_VM_ASYNC_OPS_ERROR
>>>>>>>     #define SUPPORTED_FLAGS      \
>>>>>>> -       (FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
>>>>>>> -        DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
>>>>>>> -        DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
>>>>>>> +       (FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
>>>>>>> +        DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
>>>>>>>     #else
>>>>>>>     #define SUPPORTED_FLAGS      \
>>>>>>> -       (DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
>>>>>>> +       (DRM_XE_VM_BIND_FLAG_READONLY | \
>>>>>>>          DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
>>>>>>>          0xffff)
>>>>>>>     #endif
>>>>>>>     #define XE_64K_PAGE_MASK 0xffffull
>>>>>>> +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
>>>>>>>     #define MAX_BINDS    512     /* FIXME: Picking random upper limit */
>>>>>>> @@ -2829,7 +2826,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>>>>>          int err;
>>>>>>>          int i;
>>>>>>> -       if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
>>>>>>> +       if (XE_IOCTL_DBG(xe, args->pad) ||
>>>>>>>              XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
>>>>>>>                  return -EINVAL;
>>>>>>> @@ -2857,6 +2854,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>>>>>                  *bind_ops = &args->bind;
>>>>>>>          }
>>>>>>> +       *async = !(args->syncs.flags & DRM_XE_SYNCS_FLAG_WAIT_FOR_OP);
>>>>>>> +
>>>>>>> +       if (XE_IOCTL_DBG(xe, args->syncs.flags & ~ALL_DRM_XE_SYNCS_FLAGS) ||
>>>>>>> +           XE_IOCTL_DBG(xe, !*async && args->syncs.num_syncs)) {
>>>>>>> +               err = -EINVAL;
>>>>>>> +               goto free_bind_ops;
>>>>>>> +       }
>>>>>>> +
>>>>>>>          for (i = 0; i < args->num_binds; ++i) {
>>>>>>>                  u64 range = (*bind_ops)[i].range;
>>>>>>>                  u64 addr = (*bind_ops)[i].addr;
>>>>>>> @@ -2887,18 +2892,6 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>>>>>                          goto free_bind_ops;
>>>>>>>                  }
>>>>>>> -               if (i == 0) {
>>>>>>> -                       *async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
>>>>>>> -                       if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
>>>>>>> -                               err = -EINVAL;
>>>>>>> -                               goto free_bind_ops;
>>>>>>> -                       }
>>>>>>> -               } else if (XE_IOCTL_DBG(xe, *async !=
>>>>>>> -                                       !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
>>>>>>> -                       err = -EINVAL;
>>>>>>> -                       goto free_bind_ops;
>>>>>>> -               }
>>>>>>> -
>>>>>>>                  if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
>>>>>>>                      XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
>>>>>>>                      XE_IOCTL_DBG(xe, obj && is_null) ||
>>>>>>> @@ -2951,7 +2944,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
>>>>>>>     static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
>>>>>>>                                         struct xe_exec_queue *q,
>>>>>>>                                         struct xe_sync_entry *syncs,
>>>>>>> -                                      int num_syncs)
>>>>>>> +                                      int num_syncs, bool async)
>>>>>>>     {
>>>>>>>          struct dma_fence *fence;
>>>>>>>          int i, err = 0;
>>>>>>> @@ -2967,7 +2960,7 @@ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
>>>>>>>          xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
>>>>>>>                                       fence);
>>>>>>> -       if (xe_vm_sync_mode(vm, q)) {
>>>>>>> +       if (!async) {
>>>>>>>                  long timeout = dma_fence_wait(fence, true);
>>>>>>>                  if (timeout < 0)
>>>>>>> @@ -3001,7 +2994,7 @@ int xe_vm_bind_ioctl(struct drm_device
>>>>>>> *dev, void *data, struct drm_file *file)
>>>>>>>          if (err)
>>>>>>>                  return err;
>>>>>>> -       if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
>>>>>>> +       if (XE_IOCTL_DBG(xe, args->pad) ||
>>>>>>>              XE_IOCTL_DBG(xe, args->reserved[0] || args-
>>>>>>>> reserved[1]))
>>>>>>>                  return -EINVAL;
>>>>>>> @@ -3016,12 +3009,6 @@ int xe_vm_bind_ioctl(struct drm_device
>>>>>>> *dev, void *data, struct drm_file *file)
>>>>>>>                          err = -EINVAL;
>>>>>>>                          goto put_exec_queue;
>>>>>>>                  }
>>>>>>> -
>>>>>>> -               if (XE_IOCTL_DBG(xe, args->num_binds && async
>>>>>>> !=
>>>>>>> -                                !!(q->flags &
>>>>>>> EXEC_QUEUE_FLAG_VM_ASYNC))) {
>>>>>>> -                       err = -EINVAL;
>>>>>>> -                       goto put_exec_queue;
>>>>>>> -               }
>>>>>>>          }
>>>>>>>          vm = xe_vm_lookup(xef, args->vm_id);
>>>>>>> @@ -3030,14 +3017,6 @@ int xe_vm_bind_ioctl(struct drm_device
>>>>>>> *dev, void *data, struct drm_file *file)
>>>>>>>                  goto put_exec_queue;
>>>>>>>          }
>>>>>>> -       if (!args->exec_queue_id) {
>>>>>>> -               if (XE_IOCTL_DBG(xe, args->num_binds && async
>>>>>>> !=
>>>>>>> -                                !!(vm->flags &
>>>>>>> XE_VM_FLAG_ASYNC_DEFAULT))) {
>>>>>>> -                       err = -EINVAL;
>>>>>>> -                       goto put_vm;
>>>>>>> -               }
>>>>>>> -       }
>>>>>>> -
>>>>>>>          err = down_write_killable(&vm->lock);
>>>>>>>          if (err)
>>>>>>>                  goto put_vm;
>>>>>>> @@ -3127,16 +3106,16 @@ int xe_vm_bind_ioctl(struct
>>>>>>> drm_device *dev, void *data, struct drm_file *file)
>>>>>>>                  }
>>>>>>>          }
>>>>>>> -       if (args->num_syncs) {
>>>>>>> -               syncs = kcalloc(args->num_syncs,
>>>>>>> sizeof(*syncs), GFP_KERNEL);
>>>>>>> +       if (args->syncs.num_syncs) {
>>>>>>> +               syncs = kcalloc(args->syncs.num_syncs,
>>>>>>> sizeof(*syncs), GFP_KERNEL);
>>>>>>>                  if (!syncs) {
>>>>>>>                          err = -ENOMEM;
>>>>>>>                          goto put_obj;
>>>>>>>                  }
>>>>>>>          }
>>>>>>> -       syncs_user = u64_to_user_ptr(args->syncs);
>>>>>>> -       for (num_syncs = 0; num_syncs < args->num_syncs;
>>>>>>> num_syncs++) {
>>>>>>> +       syncs_user = u64_to_user_ptr(args->syncs.syncs);
>>>>>>> +       for (num_syncs = 0; num_syncs < args-
>>>>>>>> syncs.num_syncs; num_syncs++) {
>>>>>>>                  err = xe_sync_entry_parse(xe, xef,
>>>>>>> &syncs[num_syncs],
>>>>>>>                                           
>>>>>>> &syncs_user[num_syncs],
>>>>>>>                                           
>>>>>>> (xe_vm_in_lr_mode(vm) ?
>>>>>>> @@ -3210,7 +3189,8 @@ int xe_vm_bind_ioctl(struct drm_device
>>>>>>> *dev, void *data, struct drm_file *file)
>>>>>>>          vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
>>>>>>>     free_syncs:
>>>>>>>          if (err == -ENODATA)
>>>>>>> -               err = vm_bind_ioctl_signal_fences(vm, q,
>>>>>>> syncs, num_syncs);
>>>>>>> +               err = vm_bind_ioctl_signal_fences(vm, q,
>>>>>>> syncs, num_syncs,
>>>>>>> +                                                 async);
>>>>>>>          while (num_syncs--)
>>>>>>>                  xe_sync_entry_cleanup(&syncs[num_syncs]);
>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
>>>>>>> index 23abdfd8622f..ce8b9bde7e9c 100644
>>>>>>> --- a/drivers/gpu/drm/xe/xe_vm_types.h
>>>>>>> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
>>>>>>> @@ -167,13 +167,12 @@ struct xe_vm {
>>>>>>>           */
>>>>>>>     #define XE_VM_FLAG_64K                       BIT(0)
>>>>>>>     #define XE_VM_FLAG_LR_MODE           BIT(1)
>>>>>>> -#define XE_VM_FLAG_ASYNC_DEFAULT       BIT(2)
>>>>>>> -#define XE_VM_FLAG_MIGRATION           BIT(3)
>>>>>>> -#define XE_VM_FLAG_SCRATCH_PAGE                BIT(4)
>>>>>>> -#define XE_VM_FLAG_FAULT_MODE          BIT(5)
>>>>>>> -#define XE_VM_FLAG_BANNED              BIT(6)
>>>>>>> -#define XE_VM_FLAG_TILE_ID(flags)      FIELD_GET(GENMASK(8, 7), flags)
>>>>>>> -#define XE_VM_FLAG_SET_TILE_ID(tile)   FIELD_PREP(GENMASK(8, 7), (tile)->id)
>>>>>>> +#define XE_VM_FLAG_MIGRATION           BIT(2)
>>>>>>> +#define XE_VM_FLAG_SCRATCH_PAGE                BIT(3)
>>>>>>> +#define XE_VM_FLAG_FAULT_MODE          BIT(4)
>>>>>>> +#define XE_VM_FLAG_BANNED              BIT(5)
>>>>>>> +#define XE_VM_FLAG_TILE_ID(flags)      FIELD_GET(GENMASK(7, 6), flags)
>>>>>>> +#define XE_VM_FLAG_SET_TILE_ID(tile)   FIELD_PREP(GENMASK(7, 6), (tile)->id)
>>>>>>>          unsigned long flags;
>>>>>>>          /** @composite_fence_ctx: context composite fence */
>>>>>>> @@ -385,6 +384,8 @@ enum xe_vma_op_flags {
>>>>>>>          XE_VMA_OP_PREV_COMMITTED        = BIT(3),
>>>>>>>          /** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */
>>>>>>>          XE_VMA_OP_NEXT_COMMITTED        = BIT(4),
>>>>>>> +       /** @XE_VMA_OP_ASYNC: operation is async */
>>>>>>> +       XE_VMA_OP_ASYNC                 = BIT(5),
>>>>>>>     };
>>>>>>>     /** struct xe_vma_op - VMA operation */
>>>>>>> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
>>>>>>> index eb03a49c17a1..fd8172fe2d9a 100644
>>>>>>> --- a/include/uapi/drm/xe_drm.h
>>>>>>> +++ b/include/uapi/drm/xe_drm.h
>>>>>>> @@ -141,8 +141,7 @@ struct drm_xe_engine_class_instance {
>>>>>>>           * Kernel only classes (not actual hardware engine class). Used for
>>>>>>>           * creating ordered queues of VM bind operations.
>>>>>>>           */
>>>>>>> -#define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC      5
>>>>>>> -#define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC       6
>>>>>>> +#define DRM_XE_ENGINE_CLASS_VM_BIND            5
>>>>>>>          __u16 engine_class;
>>>>>>>          __u16 engine_instance;
>>>>>>> @@ -660,7 +659,6 @@ struct drm_xe_vm_create {
>>>>>>>           * still enable recoverable pagefaults if supported by the device.
>>>>>>>           */
>>>>>>>     #define DRM_XE_VM_CREATE_FLAG_LR_MODE                (1 << 1)
>>>>>>> -#define DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT    (1 << 2)
>>>>>>>          /*
>>>>>>>           * DRM_XE_VM_CREATE_FLAG_FAULT_MODE requires also
>>>>>>>           * DRM_XE_VM_CREATE_FLAG_LR_MODE. It allows memory to be allocated
>>>>>>> @@ -668,7 +666,7 @@ struct drm_xe_vm_create {
>>>>>>>           * The xe driver internally uses recoverable pagefaults to implement
>>>>>>>           * this.
>>>>>>>           */
>>>>>>> -#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE       (1 << 3)
>>>>>>> +#define DRM_XE_VM_CREATE_FLAG_FAULT_MODE       (1 << 2)
>>>>>>>          /** @flags: Flags */
>>>>>>>          __u32 flags;
>>>>>>> @@ -776,12 +774,11 @@ struct drm_xe_vm_bind_op {
>>>>>>>          __u32 op;
>>>>>>>     #define DRM_XE_VM_BIND_FLAG_READONLY (1 << 0)
>>>>>>> -#define DRM_XE_VM_BIND_FLAG_ASYNC      (1 << 1)
>>>>>>>          /*
>>>>>>>           * Valid on a faulting VM only, do the MAP operation immediately rather
>>>>>>>           * than deferring the MAP to the page fault handler.
>>>>>>>           */
>>>>>>> -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE  (1 << 2)
>>>>>>> +#define DRM_XE_VM_BIND_FLAG_IMMEDIATE  (1 << 1)
>>>>>>>          /*
>>>>>>>           * When the NULL flag is set, the page tables are setup with a special
>>>>>>>           * bit which indicates writes are dropped and all reads return zero.  In
>>>>>>> @@ -789,7 +786,7 @@ struct drm_xe_vm_bind_op {
>>>>>>>           * operations, the BO handle MBZ, and the BO offset MBZ. This flag is
>>>>>>>           * intended to implement VK sparse bindings.
>>>>>>>           */
>>>>>>> -#define DRM_XE_VM_BIND_FLAG_NULL       (1 << 3)
>>>>>>> +#define DRM_XE_VM_BIND_FLAG_NULL       (1 << 2)
>>>>>>>          /** @flags: Bind flags */
>>>>>>>          __u32 flags;
>>>>>>> @@ -807,6 +804,27 @@ struct drm_xe_vm_bind_op {
>>>>>>>          __u64 reserved[3];
>>>>>>>     };
>>>>>>> +/**
>>>>>>> + * struct drm_xe_syncs - In / out syncs for IOCTLs.
>>>>>>> + */
>>>>>>> +struct drm_xe_syncs {
>>>>>>> +       /** @num_syncs: amount of syncs to wait on */
>>>>>>> +       __u32 num_syncs;
>>>>>>> +
>>>>>>> +       /*
>>>>>>> +        * Block in IOCTL until operation complete, num_syncs MBZ if set.
>>>>>>> +        */
>>>>>>> +#define DRM_XE_SYNCS_FLAG_WAIT_FOR_OP (1 << 0)
>>>>>>> +       /** @flags: Sync flags */
>>>>>>> +       __u32 flags;
>>>>>>> +
>>>>>>> +       /** @syncs: pointer to struct drm_xe_sync array */
>>>>>>> +       __u64 syncs;
>>>>>>> +
>>>>>>> +       /** @reserved: Reserved */
>>>>>>> +       __u64 reserved[2];
>>>>>>> +};
>>>>>>> +
>>>>>>>     struct drm_xe_vm_bind {
>>>>>>>          /** @extensions: Pointer to the first extension struct, if any */
>>>>>>>          __u64 extensions;
>>>>>>> @@ -838,14 +856,8 @@ struct drm_xe_vm_bind {
>>>>>>>                  __u64 vector_of_binds;
>>>>>>>          };
>>>>>>> -       /** @pad: MBZ */
>>>>>>> -       __u32 pad2;
>>>>>>> -
>>>>>>> -       /** @num_syncs: amount of syncs to wait on */
>>>>>>> -       __u32 num_syncs;
>>>>>>> -
>>>>>>> -       /** @syncs: pointer to struct drm_xe_sync array */
>>>>>>> -       __u64 syncs;
>>>>>>> +       /** @syncs: syncs for bind */
>>>>>>> +       struct drm_xe_syncs syncs;
>>>>>>>          /** @reserved: Reserved */
>>>>>>>          __u64 reserved[2];
>>>>>>> @@ -974,14 +986,14 @@ struct drm_xe_exec {
>>>>>>>          /** @extensions: Pointer to the first extension struct, if any */
>>>>>>>          __u64 extensions;
>>>>>>> +       /** @pad: MBZ */
>>>>>>> +       __u32 pad;
>>>>>>> +
>>>>>>>          /** @exec_queue_id: Exec queue ID for the batch buffer */
>>>>>>>          __u32 exec_queue_id;
>>>>>>> -       /** @num_syncs: Amount of struct drm_xe_sync in array. */
>>>>>>> -       __u32 num_syncs;
>>>>>>> -
>>>>>>> -       /** @syncs: Pointer to struct drm_xe_sync array. */
>>>>>>> -       __u64 syncs;
>>>>>>> +       /** @syncs: syncs for exec */
>>>>>>> +       struct drm_xe_syncs syncs;
>>>>>>>          /**
>>>>>>>           * @address: address of batch buffer if num_batch_buffer == 1 or an
>>>>>>> @@ -995,8 +1007,8 @@ struct drm_xe_exec {
>>>>>>>           */
>>>>>>>          __u16 num_batch_buffer;
>>>>>>> -       /** @pad: MBZ */
>>>>>>> -       __u16 pad[3];
>>>>>>> +       /** @pad2: MBZ */
>>>>>>> +       __u16 pad2[3];
>>>>>>>          /** @reserved: Reserved */
>>>>>>>          __u64 reserved[2];

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-xe] [RFC PATCH 5/7] drm/xe: Take in-syncs into account when num_execs or num_binds == 0
  2023-12-08 15:04   ` Thomas Hellström
@ 2023-12-12 17:18     ` Matthew Brost
  0 siblings, 0 replies; 22+ messages in thread
From: Matthew Brost @ 2023-12-12 17:18 UTC (permalink / raw)
  To: Thomas Hellström; +Cc: intel-xe

On Fri, Dec 08, 2023 at 04:04:57PM +0100, Thomas Hellström wrote:
> 
> On 12/7/23 06:57, Matthew Brost wrote:
> > Wait on in-syncs before signaling out-syncs if num_execs or num_binds ==
> > 0 in the execbuf IOCTL or VM bind IOCTL respectively.
> > 
> > v2: Wait on last fence in addition to in-fences (Thomas)
> > v3: Use function for in-fence signaling
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/xe/xe_exec.c | 10 ++++-
> >   drivers/gpu/drm/xe/xe_sync.c | 74 ++++++++++++++++++++++++++++++++++++
> >   drivers/gpu/drm/xe/xe_sync.h |  5 +++
> >   drivers/gpu/drm/xe/xe_vm.c   | 41 ++++++++++++++++----
> >   4 files changed, 121 insertions(+), 9 deletions(-)
> 
> Should we move patches 5/7 and 6/7 up the series so that everything
> works as expected when we enable the functionality?
> 

Patches 5 and 6 build on 3 and 4; how about we squash them into a single patch?

Matt

> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> 
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2023-12-12 17:18 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-07  5:57 [Intel-xe] [RFC PATCH 0/7] Syncs vs async exec/bind uAPI change Matthew Brost
2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 1/7] drm/xe: Use a flags field instead of bools for VMA create Matthew Brost
2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 2/7] drm/xe: Use a flags field instead of bools for sync parse Matthew Brost
2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 3/7] drm/xe: Allow num_binds == 0 in VM bind IOCTL Matthew Brost
2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 4/7] drm/xe: Allow num_batch_buffer == 0 in exec IOCTL Matthew Brost
2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 5/7] drm/xe: Take in-syncs into account when num_execs or num_binds == 0 Matthew Brost
2023-12-08 15:04   ` Thomas Hellström
2023-12-12 17:18     ` Matthew Brost
2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 6/7] drm/xe: Add last fence as dependency for jobs on user exec queues Matthew Brost
2023-12-07  5:57 ` [Intel-xe] [RFC PATCH 7/7] drm/xe/uapi: Uniform async vs sync handling Matthew Brost
2023-12-07 19:51   ` Rodrigo Vivi
2023-12-08 15:00   ` Thomas Hellström
2023-12-08  9:45     ` Matthew Brost
2023-12-11 15:43       ` Thomas Hellström
2023-12-11 16:49         ` Matthew Brost
2023-12-11 18:11           ` Thomas Hellström
2023-12-11 21:11             ` Matthew Brost
2023-12-12  8:43               ` Thomas Hellström
2023-12-08 12:24     ` Matthew Brost
2023-12-11 15:34       ` Thomas Hellström
2023-12-11 16:50         ` Matthew Brost
2023-12-07  7:38 ` [Intel-xe] ✗ CI.Patch_applied: failure for Syncs vs async exec/bind uAPI change Patchwork

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox