[CI 2/3] drm/xe: Take the validation rwsem in exclusive mode on OOM

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
To: intel-xe@lists.freedesktop.org
Subject: [CI 2/3] drm/xe: Take the validation rwsem in exclusive mode on OOM
Date: Mon, 10 Jun 2024 17:20:16 +0200	[thread overview]
Message-ID: <20240610152017.43436-2-thomas.hellstrom@linux.intel.com> (raw)
In-Reply-To: <20240610152017.43436-1-thomas.hellstrom@linux.intel.com>

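In the unlikely event that we hit an OOM from TTM validation, retry
once with the validation rwsem taken in exclusive mode to block parallel
validation and submission on the same device. This replaces the previous
retry scheme, which slept and retried until a timeout expired.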

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/xe/xe_exec.c         | 10 ++++-----
 drivers/gpu/drm/xe/xe_gt_pagefault.c | 10 ++++-----
 drivers/gpu/drm/xe/xe_vm.c           | 33 +++++++++++-----------------
 drivers/gpu/drm/xe/xe_vm.h           |  2 +-
 4 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index fce1519e3b34..16b4224e58ae 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -123,7 +123,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	struct xe_sched_job *job;
 	struct xe_vm *vm;
 	bool write_locked, skip_retry = false;
-	ktime_t end = 0;
+	bool exclusive = false;
 	int err = 0;
 
 	if (XE_IOCTL_DBG(xe, args->extensions) ||
@@ -229,11 +229,11 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	vm_exec.vm = &vm->gpuvm;
 	vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
 	if (xe_vm_in_lr_mode(vm)) {
-		xe_exec_init(exec, vm_exec.flags, 0, xe, false);
+		xe_exec_init(exec, vm_exec.flags, 0, xe, exclusive);
 	} else {
-		err = xe_gpuvm_exec_lock(&vm_exec, xe, false);
+		err = xe_gpuvm_exec_lock(&vm_exec, xe, exclusive);
 		if (err) {
-			if (xe_vm_validate_should_retry(exec, err, &end))
+			if (xe_vm_validate_should_retry(exec, err, &exclusive))
 				err = -EAGAIN;
 			goto err_unlock_list;
 		}
@@ -320,7 +320,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	if (err)
 		xe_sched_job_put(job);
 err_exec:
-	xe_exec_fini(exec, xe, false);
+	xe_exec_fini(exec, xe, exclusive);
 err_unlock_list:
 	up_read(&vm->lock);
 	if (err == -EAGAIN && !skip_retry)
diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index a0b9b6c56db1..b8d0076a2dd2 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -132,7 +132,7 @@ static int handle_vma_pagefault(struct xe_tile *tile, struct pagefault *pf,
 	struct xe_vm *vm = xe_vma_vm(vma);
 	struct drm_exec exec;
 	struct dma_fence *fence;
-	ktime_t end = 0;
+	bool exclusive = false;
 	int err;
 	bool atomic;
 
@@ -154,11 +154,11 @@ static int handle_vma_pagefault(struct xe_tile *tile, struct pagefault *pf,
 	}
 
 	/* Lock VM and BOs dma-resv */
-	(void) xe_exec_init(&exec, 0, 0, vm->xe, false);
+	(void) xe_exec_init(&exec, 0, 0, vm->xe, exclusive);
 	drm_exec_until_all_locked(&exec) {
 		err = xe_pf_begin(&exec, vma, atomic, tile->id);
 		drm_exec_retry_on_contention(&exec);
-		if (xe_vm_validate_should_retry(&exec, err, &end))
+		if (xe_vm_validate_should_retry(&exec, err, &exclusive))
 			err = -EAGAIN;
 		if (err)
 			goto unlock_dma_resv;
@@ -168,7 +168,7 @@ static int handle_vma_pagefault(struct xe_tile *tile, struct pagefault *pf,
 		fence = xe_vma_rebind(vm, vma, BIT(tile->id));
 		if (IS_ERR(fence)) {
 			err = PTR_ERR(fence);
-			if (xe_vm_validate_should_retry(&exec, err, &end))
+			if (xe_vm_validate_should_retry(&exec, err, &exclusive))
 				err = -EAGAIN;
 			goto unlock_dma_resv;
 		}
@@ -179,7 +179,7 @@ static int handle_vma_pagefault(struct xe_tile *tile, struct pagefault *pf,
 	vma->tile_invalidated &= ~BIT(tile->id);
 
 unlock_dma_resv:
-	xe_exec_fini(&exec, vm->xe, false);
+	xe_exec_fini(&exec, vm->xe, exclusive);
 	if (err == -EAGAIN)
 		goto retry_userptr;
 
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 25cc4e68ca58..3399c7e5bf4d 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -341,32 +341,25 @@ static void xe_vm_kill(struct xe_vm *vm, bool unlocked)
  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
  * @exec: The drm_exec object used for locking before validation.
  * @err: The error returned from ttm_bo_validate().
- * @end: A ktime_t cookie that should be set to 0 before first use and
- * that should be reused on subsequent calls.
+ * @exclusive: A pointer to a bool that holds the previous validation
+ * semaphore locking mode on input and the desired locking mode on output.
  *
  * With multiple active VMs, under memory pressure, it is possible that
  * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
  * Until ttm properly handles locking in such scenarios, best thing the
- * driver can do is retry with a timeout. Check if that is necessary, and
- * if so unlock the drm_exec's objects while keeping the ticket to prepare
- * for a rerun.
+ * driver can do is retry while locking out other validators.
+ * Check if that is necessary, and also indicate the intended mode of
+ * the validation semaphore.
  *
  * Return: true if a retry after drm_exec_init() is recommended;
  * false otherwise.
  */
-bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
+bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, bool *exclusive)
 {
-	ktime_t cur;
-
-	if (err != -ENOMEM)
-		return false;
-
-	cur = ktime_get();
-	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
-	if (!ktime_before(cur, *end))
+	if (err != -ENOMEM || *exclusive)
 		return false;
 
-	msleep(20);
+	*exclusive = true;
 	return true;
 }
 
@@ -473,7 +466,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
 	struct drm_exec exec;
 	unsigned int fence_count = 0;
 	LIST_HEAD(preempt_fences);
-	ktime_t end = 0;
+	bool exclusive = false;
 	int err = 0;
 	long wait;
 	int __maybe_unused tries = 0;
@@ -496,7 +489,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
 			goto out_unlock_outer;
 	}
 
-	err = xe_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0, vm->xe, false);
+	err = xe_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0, vm->xe, exclusive);
 	if (err)
 		goto out_unlock_outer;
 
@@ -506,8 +499,8 @@ static void preempt_rebind_work_func(struct work_struct *w)
 		err = xe_preempt_work_begin(&exec, vm, &done);
 		drm_exec_retry_on_contention(&exec);
 		if (err || done) {
-			xe_exec_fini(&exec, vm->xe, false);
-			if (err && xe_vm_validate_should_retry(&exec, err, &end))
+			xe_exec_fini(&exec, vm->xe, exclusive);
+			if (err && xe_vm_validate_should_retry(&exec, err, &exclusive))
 				err = -EAGAIN;
 
 			goto out_unlock_outer;
@@ -555,7 +548,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
 	up_read(&vm->userptr.notifier_lock);
 
 out_unlock:
-	xe_exec_fini(&exec, vm->xe, false);
+	xe_exec_fini(&exec, vm->xe, exclusive);
 out_unlock_outer:
 	if (err == -EAGAIN) {
 		trace_xe_vm_rebind_worker_retry(vm);
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index b481608b12f1..e5d12196b102 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -241,7 +241,7 @@ int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma);
 
 int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma);
 
-bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
+bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, bool *exclusive);
 
 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
 
-- 
2.44.0

next prev parent reply	other threads:[~2024-06-10 15:21 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-10 15:20 [CI 1/3] drm/xe: poor man's exhaustive eviction Thomas Hellström
2024-06-10 15:20 ` Thomas Hellström [this message]
2024-06-10 15:20 ` [CI 3/3] drm/xe/xe-for-ci: Check whether oom was due to ww mutex error injection Thomas Hellström
2024-06-10 15:28 ` ✓ CI.Patch_applied: success for series starting with [CI,1/3] drm/xe: poor man's exhaustive eviction Patchwork
2024-06-10 15:28 ` ✗ CI.checkpatch: warning " Patchwork
2024-06-10 15:30 ` ✓ CI.KUnit: success " Patchwork
2024-06-10 15:42 ` ✓ CI.Build: " Patchwork
2024-06-10 15:44 ` ✗ CI.Hooks: failure " Patchwork
2024-06-10 15:45 ` ✓ CI.checksparse: success " Patchwork
2024-06-10 16:33 ` ✓ CI.BAT: " Patchwork

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:fce1519e3b3 dfblob:16b4224e58a dfblob:a0b9b6c56db
dfblob:b8d0076a2dd dfblob:25cc4e68ca5 dfblob:3399c7e5bf4
dfblob:b481608b12f dfblob:e5d12196b10 )
 OR (
bs:"[CI 2/3] drm/xe: Take the validation rwsem in exclusive mode on OOM" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240610152017.43436-2-thomas.hellstrom@linux.intel.com \
    --to=thomas.hellstrom@linux.intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox