From: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
To: intel-xe@lists.freedesktop.org
Subject: [CI 2/3] drm/xe: Take the validation rwsem in exclusive mode on OOM
Date: Mon, 10 Jun 2024 17:20:16 +0200 [thread overview]
Message-ID: <20240610152017.43436-2-thomas.hellstrom@linux.intel.com> (raw)
In-Reply-To: <20240610152017.43436-1-thomas.hellstrom@linux.intel.com>
In the unlikely event that we hit an OOM from TTM validation, take
the validation rwsem in exclusive mode to block parallel validation
and submission on the same device.
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
drivers/gpu/drm/xe/xe_exec.c | 10 ++++-----
drivers/gpu/drm/xe/xe_gt_pagefault.c | 10 ++++-----
drivers/gpu/drm/xe/xe_vm.c | 33 +++++++++++-----------------
drivers/gpu/drm/xe/xe_vm.h | 2 +-
4 files changed, 24 insertions(+), 31 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index fce1519e3b34..16b4224e58ae 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -123,7 +123,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
struct xe_sched_job *job;
struct xe_vm *vm;
bool write_locked, skip_retry = false;
- ktime_t end = 0;
+ bool exclusive = false;
int err = 0;
if (XE_IOCTL_DBG(xe, args->extensions) ||
@@ -229,11 +229,11 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
vm_exec.vm = &vm->gpuvm;
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
if (xe_vm_in_lr_mode(vm)) {
- xe_exec_init(exec, vm_exec.flags, 0, xe, false);
+ xe_exec_init(exec, vm_exec.flags, 0, xe, exclusive);
} else {
- err = xe_gpuvm_exec_lock(&vm_exec, xe, false);
+ err = xe_gpuvm_exec_lock(&vm_exec, xe, exclusive);
if (err) {
- if (xe_vm_validate_should_retry(exec, err, &end))
+ if (xe_vm_validate_should_retry(exec, err, &exclusive))
err = -EAGAIN;
goto err_unlock_list;
}
@@ -320,7 +320,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
if (err)
xe_sched_job_put(job);
err_exec:
- xe_exec_fini(exec, xe, false);
+ xe_exec_fini(exec, xe, exclusive);
err_unlock_list:
up_read(&vm->lock);
if (err == -EAGAIN && !skip_retry)
diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index a0b9b6c56db1..b8d0076a2dd2 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -132,7 +132,7 @@ static int handle_vma_pagefault(struct xe_tile *tile, struct pagefault *pf,
struct xe_vm *vm = xe_vma_vm(vma);
struct drm_exec exec;
struct dma_fence *fence;
- ktime_t end = 0;
+ bool exclusive = false;
int err;
bool atomic;
@@ -154,11 +154,11 @@ static int handle_vma_pagefault(struct xe_tile *tile, struct pagefault *pf,
}
/* Lock VM and BOs dma-resv */
- (void) xe_exec_init(&exec, 0, 0, vm->xe, false);
+ (void) xe_exec_init(&exec, 0, 0, vm->xe, exclusive);
drm_exec_until_all_locked(&exec) {
err = xe_pf_begin(&exec, vma, atomic, tile->id);
drm_exec_retry_on_contention(&exec);
- if (xe_vm_validate_should_retry(&exec, err, &end))
+ if (xe_vm_validate_should_retry(&exec, err, &exclusive))
err = -EAGAIN;
if (err)
goto unlock_dma_resv;
@@ -168,7 +168,7 @@ static int handle_vma_pagefault(struct xe_tile *tile, struct pagefault *pf,
fence = xe_vma_rebind(vm, vma, BIT(tile->id));
if (IS_ERR(fence)) {
err = PTR_ERR(fence);
- if (xe_vm_validate_should_retry(&exec, err, &end))
+ if (xe_vm_validate_should_retry(&exec, err, &exclusive))
err = -EAGAIN;
goto unlock_dma_resv;
}
@@ -179,7 +179,7 @@ static int handle_vma_pagefault(struct xe_tile *tile, struct pagefault *pf,
vma->tile_invalidated &= ~BIT(tile->id);
unlock_dma_resv:
- xe_exec_fini(&exec, vm->xe, false);
+ xe_exec_fini(&exec, vm->xe, exclusive);
if (err == -EAGAIN)
goto retry_userptr;
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 25cc4e68ca58..3399c7e5bf4d 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -341,32 +341,25 @@ static void xe_vm_kill(struct xe_vm *vm, bool unlocked)
* xe_vm_validate_should_retry() - Whether to retry after a validate error.
* @exec: The drm_exec object used for locking before validation.
* @err: The error returned from ttm_bo_validate().
- * @end: A ktime_t cookie that should be set to 0 before first use and
- * that should be reused on subsequent calls.
+ * @exclusive: A pointer to a bool that holds the previous validation
+ * semaphore locking mode on input and the desired locking mode on output.
*
* With multiple active VMs, under memory pressure, it is possible that
* ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
* Until ttm properly handles locking in such scenarios, best thing the
- * driver can do is retry with a timeout. Check if that is necessary, and
- * if so unlock the drm_exec's objects while keeping the ticket to prepare
- * for a rerun.
+ * driver can do is retry while locking out other validators.
+ * Check whether a retry is necessary, and also indicate the intended
+ * locking mode of the validation semaphore for the retry.
*
* Return: true if a retry after drm_exec_init() is recommended;
* false otherwise.
*/
-bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
+bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, bool *exclusive)
{
- ktime_t cur;
-
- if (err != -ENOMEM)
- return false;
-
- cur = ktime_get();
- *end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
- if (!ktime_before(cur, *end))
+ if (err != -ENOMEM || *exclusive)
return false;
- msleep(20);
+ *exclusive = true;
return true;
}
@@ -473,7 +466,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
struct drm_exec exec;
unsigned int fence_count = 0;
LIST_HEAD(preempt_fences);
- ktime_t end = 0;
+ bool exclusive = false;
int err = 0;
long wait;
int __maybe_unused tries = 0;
@@ -496,7 +489,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
goto out_unlock_outer;
}
- err = xe_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0, vm->xe, false);
+ err = xe_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0, vm->xe, exclusive);
if (err)
goto out_unlock_outer;
@@ -506,8 +499,8 @@ static void preempt_rebind_work_func(struct work_struct *w)
err = xe_preempt_work_begin(&exec, vm, &done);
drm_exec_retry_on_contention(&exec);
if (err || done) {
- xe_exec_fini(&exec, vm->xe, false);
- if (err && xe_vm_validate_should_retry(&exec, err, &end))
+ xe_exec_fini(&exec, vm->xe, exclusive);
+ if (err && xe_vm_validate_should_retry(&exec, err, &exclusive))
err = -EAGAIN;
goto out_unlock_outer;
@@ -555,7 +548,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
up_read(&vm->userptr.notifier_lock);
out_unlock:
- xe_exec_fini(&exec, vm->xe, false);
+ xe_exec_fini(&exec, vm->xe, exclusive);
out_unlock_outer:
if (err == -EAGAIN) {
trace_xe_vm_rebind_worker_retry(vm);
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index b481608b12f1..e5d12196b102 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -241,7 +241,7 @@ int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma);
int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma);
-bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
+bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, bool *exclusive);
int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
--
2.44.0
next prev parent reply other threads:[~2024-06-10 15:21 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-06-10 15:20 [CI 1/3] drm/xe: poor man's exhaustive eviction Thomas Hellström
2024-06-10 15:20 ` Thomas Hellström [this message]
2024-06-10 15:20 ` [CI 3/3] drm/xe/xe-for-ci: Check whether oom was due to ww mutex error injection Thomas Hellström
2024-06-10 15:28 ` ✓ CI.Patch_applied: success for series starting with [CI,1/3] drm/xe: poor man's exhaustive eviction Patchwork
2024-06-10 15:28 ` ✗ CI.checkpatch: warning " Patchwork
2024-06-10 15:30 ` ✓ CI.KUnit: success " Patchwork
2024-06-10 15:42 ` ✓ CI.Build: " Patchwork
2024-06-10 15:44 ` ✗ CI.Hooks: failure " Patchwork
2024-06-10 15:45 ` ✓ CI.checksparse: success " Patchwork
2024-06-10 16:33 ` ✓ CI.BAT: " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240610152017.43436-2-thomas.hellstrom@linux.intel.com \
--to=thomas.hellstrom@linux.intel.com \
--cc=intel-xe@lists.freedesktop.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox