[PATCH v3 2/2] drm/xe: Decouple bind queue last fence from TLB invalidations

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: Matthew Brost <matthew.brost@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: thomas.hellstrom@linux.intel.com
Subject: [PATCH v3 2/2] drm/xe: Decouple bind queue last fence from TLB invalidations
Date: Fri, 24 Oct 2025 15:20:47 -0700	[thread overview]
Message-ID: <20251024222047.1481039-3-matthew.brost@intel.com> (raw)
In-Reply-To: <20251024222047.1481039-1-matthew.brost@intel.com>

Separate the bind queue's last fence to apply only to the bind job,
rather than combining it with associated TLB invalidation jobs. This
avoids unnecessary serialization of bind jobs on prior TLB
invalidations.

Since user fence signaling depends on the completion of both bind and
TLB invalidation jobs, their fences are merged later in the bind
pipeline to preserve correct signaling order.

Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
Signed-off-by: Matthew Brost <matthew.brost@intel.com>

---
v3:
 - Fix lockdep assert for migrate queues (CI)
---
 drivers/gpu/drm/xe/xe_exec_queue.c |  3 +-
 drivers/gpu/drm/xe/xe_pt.c         | 64 ++++++++++--------------------
 drivers/gpu/drm/xe/xe_sync.c       | 58 +++++++++++++++++++++++----
 drivers/gpu/drm/xe/xe_vm.c         | 53 +++++++++++++------------
 4 files changed, 103 insertions(+), 75 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index 036640916f97..36ddf98ff537 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -1006,7 +1006,8 @@ static void xe_exec_queue_last_fence_lockdep_assert(struct xe_exec_queue *q,
 		lockdep_assert_held(&vm->lock);
 	} else {
 		xe_vm_assert_held(vm);
-		lockdep_assert_held(&q->hwe->hw_engine_group->mode_sem);
+		if (!(q->flags & EXEC_QUEUE_FLAG_MIGRATE))
+			lockdep_assert_held(&q->hwe->hw_engine_group->mode_sem);
 	}
 }
 
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index d22fd1ccc0ba..7637757ca0dc 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -3,8 +3,6 @@
  * Copyright © 2022 Intel Corporation
  */
 
-#include <linux/dma-fence-array.h>
-
 #include "xe_pt.h"
 
 #include "regs/xe_gtt_defs.h"
@@ -2359,10 +2357,9 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 	struct xe_vm *vm = vops->vm;
 	struct xe_vm_pgtable_update_ops *pt_update_ops =
 		&vops->pt_update_ops[tile->id];
-	struct dma_fence *fence, *ifence, *mfence;
+	struct xe_exec_queue *q = pt_update_ops->q;
+	struct dma_fence *fence, *ifence = NULL, *mfence = NULL;
 	struct xe_tlb_inval_job *ijob = NULL, *mjob = NULL;
-	struct dma_fence **fences = NULL;
-	struct dma_fence_array *cf = NULL;
 	struct xe_range_fence *rfence;
 	struct xe_vma_op *op;
 	int err = 0, i;
@@ -2390,7 +2387,6 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 #endif
 
 	if (pt_update_ops->needs_invalidation) {
-		struct xe_exec_queue *q = pt_update_ops->q;
 		struct xe_dep_scheduler *dep_scheduler =
 			to_dep_scheduler(q, tile->primary_gt);
 
@@ -2419,17 +2415,6 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 				goto free_ijob;
 			}
 			update.mjob = mjob;
-
-			fences = kmalloc_array(2, sizeof(*fences), GFP_KERNEL);
-			if (!fences) {
-				err = -ENOMEM;
-				goto free_ijob;
-			}
-			cf = dma_fence_array_alloc(2);
-			if (!cf) {
-				err = -ENOMEM;
-				goto free_ijob;
-			}
 		}
 	}
 
@@ -2460,31 +2445,12 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 				  pt_update_ops->last, fence))
 		dma_fence_wait(fence, false);
 
-	/* tlb invalidation must be done before signaling unbind/rebind */
-	if (ijob) {
-		struct dma_fence *__fence;
-
+	if (ijob)
 		ifence = xe_tlb_inval_job_push(ijob, tile->migrate, fence);
-		__fence = ifence;
-
-		if (mjob) {
-			fences[0] = ifence;
-			mfence = xe_tlb_inval_job_push(mjob, tile->migrate,
-						       fence);
-			fences[1] = mfence;
-
-			dma_fence_array_init(cf, 2, fences,
-					     vm->composite_fence_ctx,
-					     vm->composite_fence_seqno++,
-					     false);
-			__fence = &cf->base;
-		}
+	if (mjob)
+		mfence = xe_tlb_inval_job_push(mjob, tile->migrate, fence);
 
-		dma_fence_put(fence);
-		fence = __fence;
-	}
-
-	if (!mjob) {
+	if (!mjob && !ijob) {
 		dma_resv_add_fence(xe_vm_resv(vm), fence,
 				   pt_update_ops->wait_vm_bookkeep ?
 				   DMA_RESV_USAGE_KERNEL :
@@ -2492,6 +2458,14 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 
 		list_for_each_entry(op, &vops->list, link)
 			op_commit(vops->vm, tile, pt_update_ops, op, fence, NULL);
+	} else if (ijob && !mjob) {
+		dma_resv_add_fence(xe_vm_resv(vm), ifence,
+				   pt_update_ops->wait_vm_bookkeep ?
+				   DMA_RESV_USAGE_KERNEL :
+				   DMA_RESV_USAGE_BOOKKEEP);
+
+		list_for_each_entry(op, &vops->list, link)
+			op_commit(vops->vm, tile, pt_update_ops, op, ifence, NULL);
 	} else {
 		dma_resv_add_fence(xe_vm_resv(vm), ifence,
 				   pt_update_ops->wait_vm_bookkeep ?
@@ -2511,16 +2485,22 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
 	if (pt_update_ops->needs_svm_lock)
 		xe_svm_notifier_unlock(vm);
 
+	xe_exec_queue_last_fence_set(q, vm, fence);
+	xe_exec_queue_tlb_inval_last_fence_set(q, vm, ifence,
+					       XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
+	xe_exec_queue_tlb_inval_last_fence_set(q, vm, mfence,
+					       XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT);
+
 	xe_tlb_inval_job_put(mjob);
 	xe_tlb_inval_job_put(ijob);
+	dma_fence_put(ifence);
+	dma_fence_put(mfence);
 
 	return fence;
 
 free_rfence:
 	kfree(rfence);
 free_ijob:
-	kfree(cf);
-	kfree(fences);
 	xe_tlb_inval_job_put(mjob);
 	xe_tlb_inval_job_put(ijob);
 kill_vm_tile1:
diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index 82872a51f098..a5c073100b48 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -14,7 +14,7 @@
 #include <drm/drm_syncobj.h>
 #include <uapi/drm/xe_drm.h>
 
-#include "xe_device_types.h"
+#include "xe_device.h"
 #include "xe_exec_queue.h"
 #include "xe_macros.h"
 #include "xe_sched_job_types.h"
@@ -284,26 +284,70 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
 	struct dma_fence **fences = NULL;
 	struct dma_fence_array *cf = NULL;
 	struct dma_fence *fence;
-	int i, num_in_fence = 0, current_fence = 0;
+	int i, num_fence = 0, current_fence = 0;
 
 	lockdep_assert_held(&vm->lock);
 
 	/* Count in-fences */
 	for (i = 0; i < num_sync; ++i) {
 		if (sync[i].fence) {
-			++num_in_fence;
+			++num_fence;
 			fence = sync[i].fence;
 		}
 	}
 
 	/* Easy case... */
-	if (!num_in_fence) {
+	if (!num_fence) {
+		if (q->flags & EXEC_QUEUE_FLAG_VM) {
+			struct xe_exec_queue *__q;
+			struct xe_tile *tile;
+			u8 id;
+
+			for_each_tile(tile, vm->xe, id)
+				num_fence += (1 + XE_MAX_GT_PER_TILE);
+
+			fences = kmalloc_array(num_fence, sizeof(*fences),
+					       GFP_KERNEL);
+			if (!fences)
+				return ERR_PTR(-ENOMEM);
+
+			fences[current_fence++] =
+				xe_exec_queue_last_fence_get(q, vm);
+			for_each_tlb_inval(i)
+				fences[current_fence++] =
+					xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
+			list_for_each_entry(__q, &q->multi_gt_list,
+					    multi_gt_link) {
+				fences[current_fence++] =
+					xe_exec_queue_last_fence_get(__q, vm);
+				for_each_tlb_inval(i)
+					fences[current_fence++] =
+						xe_exec_queue_tlb_inval_last_fence_get(__q, vm, i);
+			}
+
+			xe_assert(vm->xe, current_fence == num_fence);
+			cf = dma_fence_array_create(num_fence, fences,
+						    vm->composite_fence_ctx,
+						    vm->composite_fence_seqno++,
+						    false);
+			if (!cf) {
+				--vm->composite_fence_seqno;
+				goto err_out;
+			}
+
+			return &cf->base;
+		}
+
 		fence = xe_exec_queue_last_fence_get(q, vm);
 		return fence;
 	}
 
-	/* Create composite fence */
-	fences = kmalloc_array(num_in_fence + 1, sizeof(*fences), GFP_KERNEL);
+	/*
+	 * Create composite fence - FIXME - the below code doesn't work. This is
+	 * unused in Mesa so we are ok for the moment. Perhaps we just disable
+	 * this entire code path if number of in fences != 0.
+	 */
+	fences = kmalloc_array(num_fence + 1, sizeof(*fences), GFP_KERNEL);
 	if (!fences)
 		return ERR_PTR(-ENOMEM);
 	for (i = 0; i < num_sync; ++i) {
@@ -313,7 +357,7 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
 		}
 	}
 	fences[current_fence++] = xe_exec_queue_last_fence_get(q, vm);
-	cf = dma_fence_array_create(num_in_fence, fences,
+	cf = dma_fence_array_create(num_fence, fences,
 				    vm->composite_fence_ctx,
 				    vm->composite_fence_seqno++,
 				    false);
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index d2a2f823f1b3..f6fda1dd8f9d 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3107,20 +3107,26 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
 	struct dma_fence *fence = NULL;
 	struct dma_fence **fences = NULL;
 	struct dma_fence_array *cf = NULL;
-	int number_tiles = 0, current_fence = 0, err;
+	int number_tiles = 0, current_fence = 0, n_fence = 0, err;
 	u8 id;
 
 	number_tiles = vm_ops_setup_tile_args(vm, vops);
 	if (number_tiles == 0)
 		return ERR_PTR(-ENODATA);
 
-	if (number_tiles > 1) {
-		fences = kmalloc_array(number_tiles, sizeof(*fences),
-				       GFP_KERNEL);
-		if (!fences) {
-			fence = ERR_PTR(-ENOMEM);
-			goto err_trace;
-		}
+	for_each_tile(tile, vm->xe, id)
+		n_fence += (1 + XE_MAX_GT_PER_TILE);
+
+	fences = kmalloc_array(n_fence, sizeof(*fences), GFP_KERNEL);
+	if (!fences) {
+		fence = ERR_PTR(-ENOMEM);
+		goto err_trace;
+	}
+
+	cf = dma_fence_array_alloc(n_fence);
+	if (!cf) {
+		fence = ERR_PTR(-ENOMEM);
+		goto err_out;
 	}
 
 	for_each_tile(tile, vm->xe, id) {
@@ -3137,29 +3143,28 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
 	trace_xe_vm_ops_execute(vops);
 
 	for_each_tile(tile, vm->xe, id) {
+		struct xe_exec_queue *q = vops->pt_update_ops[tile->id].q;
+		int i;
+
+		fence = NULL;
 		if (!vops->pt_update_ops[id].num_ops)
-			continue;
+			goto collect_fences;
 
 		fence = xe_pt_update_ops_run(tile, vops);
 		if (IS_ERR(fence))
 			goto err_out;
 
-		if (fences)
-			fences[current_fence++] = fence;
+collect_fences:
+		fences[current_fence++] = fence ?: dma_fence_get_stub();
+		for_each_tlb_inval(i)
+			fences[current_fence++] =
+				xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
 	}
 
-	if (fences) {
-		cf = dma_fence_array_create(number_tiles, fences,
-					    vm->composite_fence_ctx,
-					    vm->composite_fence_seqno++,
-					    false);
-		if (!cf) {
-			--vm->composite_fence_seqno;
-			fence = ERR_PTR(-ENOMEM);
-			goto err_out;
-		}
-		fence = &cf->base;
-	}
+	xe_assert(vm->xe, current_fence == n_fence);
+	dma_fence_array_init(cf, n_fence, fences, vm->composite_fence_ctx,
+			     vm->composite_fence_seqno++, false);
+	fence = &cf->base;
 
 	for_each_tile(tile, vm->xe, id) {
 		if (!vops->pt_update_ops[id].num_ops)
@@ -3220,7 +3225,6 @@ static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
 				   struct dma_fence *fence)
 {
-	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, vops->q);
 	struct xe_user_fence *ufence;
 	struct xe_vma_op *op;
 	int i;
@@ -3241,7 +3245,6 @@ static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
 	if (fence) {
 		for (i = 0; i < vops->num_syncs; i++)
 			xe_sync_entry_signal(vops->syncs + i, fence);
-		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
 	}
 }
 
-- 
2.34.1

next prev parent reply	other threads:[~2025-10-24 22:20 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-24 22:20 [PATCH v3 0/2] Fix serialization on burst of unbinds - v2 Matthew Brost
2025-10-24 22:20 ` [PATCH v3 1/2] drm/xe: Add last fence attachment to TLB invalidation job queues Matthew Brost
2025-10-24 22:20 ` Matthew Brost [this message]
2025-10-24 22:27 ` ✗ CI.checkpatch: warning for Fix serialization on burst of unbinds - v2 Patchwork
2025-10-24 22:28 ` ✓ CI.KUnit: success " Patchwork
2025-10-24 23:15 ` ✗ Xe.CI.BAT: failure " Patchwork
2025-10-25 11:11 ` ✗ Xe.CI.Full: " Patchwork
2025-10-27  2:13 ` [PATCH v3 0/2] " Matthew Brost

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:036640916f9 dfblob:36ddf98ff53 dfblob:d22fd1ccc0b
dfblob:7637757ca0d dfblob:82872a51f09 dfblob:a5c073100b4
dfblob:d2a2f823f1b dfblob:f6fda1dd8f9 )
 OR (
bs:"[PATCH v3 2/2] drm/xe: Decouple bind queue last fence from TLB invalidations" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251024222047.1481039-3-matthew.brost@intel.com \
    --to=matthew.brost@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=thomas.hellstrom@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.