All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
To: intel-xe@lists.freedesktop.org
Subject: [Intel-xe] [PATCH 2/2] drm/xe: Fix the separate bind-engine race using coarse-granularity dependencies
Date: Thu, 29 Jun 2023 22:51:34 +0200	[thread overview]
Message-ID: <20230629205134.111849-3-thomas.hellstrom@linux.intel.com> (raw)
In-Reply-To: <20230629205134.111849-1-thomas.hellstrom@linux.intel.com>

Separate bind-engines operating on the same VM range might race updating
page-tables. To make sure that doesn't happen, each page-table update
operation needs to collect internal dependencies to await before the
job is executed.

Provide an infrastructure to do that. Initially we save a single dma-fence
for the entire VM, which thus removes the benefit of separate bind-engines
in favour of fixing the race, but a more fine-grained dependency
tracking can be achieved by using, for example, the same method as the
i915 vma_resources (an interval tree storing unsignaled fences). That
of course comes with increasing code complexity.

This patch will break the xe_vm@bind-engines-independent igt test, but that
would need an update anyway to avoid the independent binds using the
same address range. In any case, such a test would not work with the
initial xe implementation unless the binds were using different vms.

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/xe/xe_migrate.c  |  2 ++
 drivers/gpu/drm/xe/xe_migrate.h  |  2 ++
 drivers/gpu/drm/xe/xe_pt.c       | 48 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_vm.c       |  1 +
 drivers/gpu/drm/xe/xe_vm_types.h |  8 ++++++
 5 files changed, 61 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index 41c90f6710ee..ff0a422f59a5 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -1073,6 +1073,7 @@ xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
 		return ERR_PTR(-ETIME);
 
 	if (ops->pre_commit) {
+		pt_update->job = NULL;
 		err = ops->pre_commit(pt_update);
 		if (err)
 			return ERR_PTR(err);
@@ -1294,6 +1295,7 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
 		goto err_job;
 
 	if (ops->pre_commit) {
+		pt_update->job = job;
 		err = ops->pre_commit(pt_update);
 		if (err)
 			goto err_job;
diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
index 204337ea3b4e..b4135876e3f7 100644
--- a/drivers/gpu/drm/xe/xe_migrate.h
+++ b/drivers/gpu/drm/xe/xe_migrate.h
@@ -69,6 +69,8 @@ struct xe_migrate_pt_update {
 	const struct xe_migrate_pt_update_ops *ops;
 	/** @vma: The vma we're updating the pagetable for. */
 	struct xe_vma *vma;
+	/** @job: The job if a GPU page-table update. NULL otherwise */
+	struct xe_sched_job *job;
 };
 
 struct xe_migrate *xe_migrate_init(struct xe_tile *tile);
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index fe1c77b139e4..f38e7b5a3b32 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -1119,6 +1119,42 @@ struct xe_pt_migrate_pt_update {
 	bool locked;
 };
 
+/*
+ * This function adds the needed dependencies to a page-table update job
+ * to make sure racing jobs for separate bind engines don't race writing
+ * to the same page-table range, wreaking havoc. Initially use a single
+ * fence for the entire VM. An optimization would use smaller granularity.
+ */
+static int xe_pt_vm_dependencies(struct xe_sched_job *job, struct xe_vm *vm)
+{
+	int err;
+
+	if (!vm->last_update_fence)
+		return 0;
+
+	if (dma_fence_is_signaled(vm->last_update_fence)) {
+		dma_fence_put(vm->last_update_fence);
+		vm->last_update_fence = NULL;
+		return 0;
+	}
+
+	/* Is this a CPU update? GPU is busy updating, so return an error */
+	if (!job)
+		return -ETIME;
+
+	dma_fence_get(vm->last_update_fence);
+	err = drm_sched_job_add_dependency(&job->drm, vm->last_update_fence);
+	if (err)
+		dma_fence_put(vm->last_update_fence);
+
+	return err;
+}
+
+static int xe_pt_pre_commit(struct xe_migrate_pt_update *pt_update)
+{
+	return xe_pt_vm_dependencies(pt_update->job, pt_update->vma->vm);
+}
+
 static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
 {
 	struct xe_pt_migrate_pt_update *userptr_update =
@@ -1126,6 +1162,10 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
 	struct xe_vma *vma = pt_update->vma;
 	unsigned long notifier_seq = vma->userptr.notifier_seq;
 	struct xe_vm *vm = vma->vm;
+	int err = xe_pt_vm_dependencies(pt_update->job, vm);
+
+	if (err)
+		return err;
 
 	userptr_update->locked = false;
 
@@ -1164,6 +1204,7 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
 
 static const struct xe_migrate_pt_update_ops bind_ops = {
 	.populate = xe_vm_populate_pgtable,
+	.pre_commit = xe_pt_pre_commit,
 };
 
 static const struct xe_migrate_pt_update_ops userptr_bind_ops = {
@@ -1345,6 +1386,9 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_engine *e,
 	if (!IS_ERR(fence)) {
 		LLIST_HEAD(deferred);
 
+		dma_fence_put(vm->last_update_fence);
+		vm->last_update_fence = dma_fence_get(fence);
+
 		/* TLB invalidation must be done before signaling rebind */
 		if (ifence) {
 			int err = invalidation_fence_init(tile->primary_gt, ifence, fence,
@@ -1591,6 +1635,7 @@ xe_pt_commit_unbind(struct xe_vma *vma,
 
 static const struct xe_migrate_pt_update_ops unbind_ops = {
 	.populate = xe_migrate_clear_pgtable_callback,
+	.pre_commit = xe_pt_pre_commit,
 };
 
 static const struct xe_migrate_pt_update_ops userptr_unbind_ops = {
@@ -1666,6 +1711,9 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_engine *e
 	if (!IS_ERR(fence)) {
 		int err;
 
+		dma_fence_put(vm->last_update_fence);
+		vm->last_update_fence = dma_fence_get(fence);
+
 		/* TLB invalidation must be done before signaling unbind */
 		err = invalidation_fence_init(tile->primary_gt, ifence, fence, vma);
 		if (err) {
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 8b8c9c5aeb01..f90f3a7c6ede 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -1517,6 +1517,7 @@ static void vm_destroy_work_func(struct work_struct *w)
 
 	trace_xe_vm_free(vm);
 	dma_fence_put(vm->rebind_fence);
+	dma_fence_put(vm->last_update_fence);
 	dma_resv_fini(&vm->resv);
 	kfree(vm);
 }
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index c148dd49a6ca..5d9eebe5c6bb 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -343,6 +343,14 @@ struct xe_vm {
 		bool capture_once;
 	} error_capture;
 
+	/**
+	 * @last_update_fence: fence representing the last page-table
+	 * update on this VM. Used to avoid races between separate
+	 * bind engines. Ideally this should be an interval tree of
+	 * unsignaled fences. Protected by the vm resv.
+	 */
+	struct dma_fence *last_update_fence;
+
 	/** @batch_invalidate_tlb: Always invalidate TLB before batch start */
 	bool batch_invalidate_tlb;
 };
-- 
2.40.1


  parent reply	other threads:[~2023-06-29 20:51 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-29 20:51 [Intel-xe] [PATCH 0/2] drm/xe: Fix two bind races Thomas Hellström
2023-06-29 20:51 ` [Intel-xe] [PATCH 1/2] drm/xe: Make page-table updates using the default engine happen in order Thomas Hellström
2023-06-30  2:18   ` Matthew Brost
2023-06-29 20:51 ` Thomas Hellström [this message]
2023-07-01  4:21   ` [Intel-xe] [PATCH 2/2] drm/xe: Fix the separate bind-engine race using coarse-granularity dependencies Matthew Brost
2023-07-02 21:13     ` Thomas Hellström
2023-06-29 20:54 ` [Intel-xe] ✓ CI.Patch_applied: success for drm/xe: Fix two bind races Patchwork
2023-06-29 20:54 ` [Intel-xe] ✓ CI.checkpatch: " Patchwork
2023-06-29 20:56 ` [Intel-xe] ✓ CI.KUnit: " Patchwork
2023-06-29 20:59 ` [Intel-xe] ✓ CI.Build: " Patchwork
2023-06-29 21:00 ` [Intel-xe] ✓ CI.Hooks: " Patchwork
2023-06-29 21:01 ` [Intel-xe] ✗ CI.checksparse: warning " Patchwork
2023-06-29 21:45 ` [Intel-xe] ○ CI.BAT: info " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230629205134.111849-3-thomas.hellstrom@linux.intel.com \
    --to=thomas.hellstrom@linux.intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.