* [Intel-gfx] [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects
@ 2022-08-17 6:55 Sviatoslav Peleshko
2022-08-17 7:34 ` Thomas Hellström
2022-08-17 19:04 ` [Intel-gfx] ✗ Fi.CI.BAT: failure for " Patchwork
0 siblings, 2 replies; 4+ messages in thread
From: Sviatoslav Peleshko @ 2022-08-17 6:55 UTC (permalink / raw)
Cc: Sviatoslav Peleshko, Thomas Hellström, David Airlie,
intel-gfx, linux-kernel, dri-devel, Daniel Vetter, Rodrigo Vivi
The i915_gem_object_trylock we had in the grab_vma() makes it return false
when the vma->obj is already locked. In this case we'll skip this vma
during eviction, and eventually might be forced to return -ENOSPC even
though we could've evicted this vma if we waited for the lock a bit.
To fix this, replace the i915_gem_object_trylock with i915_gem_object_lock.
And because we have to worry about the potential deadlock now, bubble-up
the error code, so it will be correctly handled by the WW mechanism.
This fixes the issue https://gitlab.freedesktop.org/drm/intel/-/issues/6564
Fixes: 7e00897be8bf ("drm/i915: Add object locking to i915_gem_evict_for_node and i915_gem_evict_something, v2.")
Signed-off-by: Sviatoslav Peleshko <sviatoslav.peleshko@globallogic.com>
---
drivers/gpu/drm/i915/i915_gem_evict.c | 69 ++++++++++++++++++---------
1 file changed, 46 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index f025ee4fa526..9d43f213f68f 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -55,49 +55,58 @@ static int ggtt_flush(struct intel_gt *gt)
return intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
}
-static bool grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx *ww)
+static int grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx *ww)
{
+ int ret = 0;
+
/*
* We add the extra refcount so the object doesn't drop to zero until
* after ungrab_vma(), this way trylock is always paired with unlock.
*/
if (i915_gem_object_get_rcu(vma->obj)) {
- if (!i915_gem_object_trylock(vma->obj, ww)) {
+ ret = i915_gem_object_lock(vma->obj, ww);
+ if (ret)
i915_gem_object_put(vma->obj);
- return false;
- }
} else {
/* Dead objects don't need pins */
atomic_and(~I915_VMA_PIN_MASK, &vma->flags);
}
- return true;
+ return ret;
}
-static void ungrab_vma(struct i915_vma *vma)
+static void ungrab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx *ww)
{
if (dying_vma(vma))
return;
- i915_gem_object_unlock(vma->obj);
+ if (!ww)
+ i915_gem_object_unlock(vma->obj);
+
i915_gem_object_put(vma->obj);
}
-static bool
+static int
mark_free(struct drm_mm_scan *scan,
struct i915_gem_ww_ctx *ww,
struct i915_vma *vma,
unsigned int flags,
struct list_head *unwind)
{
+ int err;
+
if (i915_vma_is_pinned(vma))
- return false;
+ return -ENOSPC;
- if (!grab_vma(vma, ww))
- return false;
+ err = grab_vma(vma, ww);
+ if (err)
+ return err;
list_add(&vma->evict_link, unwind);
- return drm_mm_scan_add_block(scan, &vma->node);
+ if (!drm_mm_scan_add_block(scan, &vma->node))
+ return -ENOSPC;
+
+ return 0;
}
static bool defer_evict(struct i915_vma *vma)
@@ -150,6 +159,7 @@ i915_gem_evict_something(struct i915_address_space *vm,
enum drm_mm_insert_mode mode;
struct i915_vma *active;
int ret;
+ int err = 0;
lockdep_assert_held(&vm->mutex);
trace_i915_gem_evict(vm, min_size, alignment, flags);
@@ -210,17 +220,23 @@ i915_gem_evict_something(struct i915_address_space *vm,
continue;
}
- if (mark_free(&scan, ww, vma, flags, &eviction_list))
+ err = mark_free(&scan, ww, vma, flags, &eviction_list);
+ if (!err)
goto found;
+ if (err == -EDEADLK)
+ break;
}
/* Nothing found, clean up and bail out! */
list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
ret = drm_mm_scan_remove_block(&scan, &vma->node);
BUG_ON(ret);
- ungrab_vma(vma);
+ ungrab_vma(vma, ww);
}
+ if (err == -EDEADLK)
+ return err;
+
/*
* Can we unpin some objects such as idle hw contents,
* or pending flips? But since only the GGTT has global entries
@@ -267,7 +283,7 @@ i915_gem_evict_something(struct i915_address_space *vm,
__i915_vma_pin(vma);
} else {
list_del(&vma->evict_link);
- ungrab_vma(vma);
+ ungrab_vma(vma, ww);
}
}
@@ -277,17 +293,21 @@ i915_gem_evict_something(struct i915_address_space *vm,
__i915_vma_unpin(vma);
if (ret == 0)
ret = __i915_vma_unbind(vma);
- ungrab_vma(vma);
+ ungrab_vma(vma, ww);
}
while (ret == 0 && (node = drm_mm_scan_color_evict(&scan))) {
vma = container_of(node, struct i915_vma, node);
/* If we find any non-objects (!vma), we cannot evict them */
- if (vma->node.color != I915_COLOR_UNEVICTABLE &&
- grab_vma(vma, ww)) {
- ret = __i915_vma_unbind(vma);
- ungrab_vma(vma);
+ if (vma->node.color != I915_COLOR_UNEVICTABLE) {
+ ret = grab_vma(vma, ww);
+ if (!ret) {
+ ret = __i915_vma_unbind(vma);
+ ungrab_vma(vma, ww);
+ } else if (ret != -EDEADLK) {
+ ret = -ENOSPC;
+ }
} else {
ret = -ENOSPC;
}
@@ -382,8 +402,11 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
break;
}
- if (!grab_vma(vma, ww)) {
- ret = -ENOSPC;
+ ret = grab_vma(vma, ww);
+ if (ret) {
+ if (ret != -EDEADLK)
+ ret = -ENOSPC;
+
break;
}
@@ -405,7 +428,7 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
if (ret == 0)
ret = __i915_vma_unbind(vma);
- ungrab_vma(vma);
+ ungrab_vma(vma, ww);
}
return ret;
--
2.37.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [Intel-gfx] [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects
2022-08-17 6:55 [Intel-gfx] [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects Sviatoslav Peleshko
@ 2022-08-17 7:34 ` Thomas Hellström
2022-08-30 14:19 ` Tvrtko Ursulin
2022-08-17 19:04 ` [Intel-gfx] ✗ Fi.CI.BAT: failure for " Patchwork
1 sibling, 1 reply; 4+ messages in thread
From: Thomas Hellström @ 2022-08-17 7:34 UTC (permalink / raw)
To: Sviatoslav Peleshko
Cc: David Airlie, intel-gfx, linux-kernel, dri-devel, Daniel Vetter,
Rodrigo Vivi
On Wed, 2022-08-17 at 09:55 +0300, Sviatoslav Peleshko wrote:
> The i915_gem_object_trylock we had in the grab_vma() makes it return
> false
> when the vma->obj is already locked. In this case we'll skip this vma
> during eviction, and eventually might be forced to return -ENOSPC
> even
> though we could've evicted this vma if we waited for the lock a bit.
>
> To fix this, replace the i915_gem_object_trylock with
> i915_gem_object_lock.
> And because we have to worry about the potential deadlock now,
> bubble-up
> the error code, so it will be correctly handled by the WW mechanism.
>
> This fixes the issue
> https://gitlab.freedesktop.org/drm/intel/-/issues/6564
>
> Fixes: 7e00897be8bf ("drm/i915: Add object locking to
> i915_gem_evict_for_node and i915_gem_evict_something, v2.")
> Signed-off-by: Sviatoslav Peleshko
> <sviatoslav.peleshko@globallogic.com>
> ---
> drivers/gpu/drm/i915/i915_gem_evict.c | 69 ++++++++++++++++++-------
> --
> 1 file changed, 46 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c
> b/drivers/gpu/drm/i915/i915_gem_evict.c
> index f025ee4fa526..9d43f213f68f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -55,49 +55,58 @@ static int ggtt_flush(struct intel_gt *gt)
> return intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
> }
>
> -static bool grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
> +static int grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
> {
> + int ret = 0;
> +
> /*
> * We add the extra refcount so the object doesn't drop to
> zero until
> * after ungrab_vma(), this way trylock is always paired with
> unlock.
> */
> if (i915_gem_object_get_rcu(vma->obj)) {
> - if (!i915_gem_object_trylock(vma->obj, ww)) {
> + ret = i915_gem_object_lock(vma->obj, ww);
Isn't the vm->mutex held here? If so, then this would be a lockdep
violation.
/Thomas
> + if (ret)
> i915_gem_object_put(vma->obj);
> - return false;
> - }
> } else {
> /* Dead objects don't need pins */
> atomic_and(~I915_VMA_PIN_MASK, &vma->flags);
> }
>
> - return true;
> + return ret;
> }
>
> -static void ungrab_vma(struct i915_vma *vma)
> +static void ungrab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
> {
> if (dying_vma(vma))
> return;
>
> - i915_gem_object_unlock(vma->obj);
> + if (!ww)
> + i915_gem_object_unlock(vma->obj);
> +
> i915_gem_object_put(vma->obj);
> }
>
> -static bool
> +static int
> mark_free(struct drm_mm_scan *scan,
> struct i915_gem_ww_ctx *ww,
> struct i915_vma *vma,
> unsigned int flags,
> struct list_head *unwind)
> {
> + int err;
> +
> if (i915_vma_is_pinned(vma))
> - return false;
> + return -ENOSPC;
>
> - if (!grab_vma(vma, ww))
> - return false;
> + err = grab_vma(vma, ww);
> + if (err)
> + return err;
>
> list_add(&vma->evict_link, unwind);
> - return drm_mm_scan_add_block(scan, &vma->node);
> + if (!drm_mm_scan_add_block(scan, &vma->node))
> + return -ENOSPC;
> +
> + return 0;
> }
>
> static bool defer_evict(struct i915_vma *vma)
> @@ -150,6 +159,7 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
> enum drm_mm_insert_mode mode;
> struct i915_vma *active;
> int ret;
> + int err = 0;
>
> lockdep_assert_held(&vm->mutex);
> trace_i915_gem_evict(vm, min_size, alignment, flags);
> @@ -210,17 +220,23 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
> continue;
> }
>
> - if (mark_free(&scan, ww, vma, flags, &eviction_list))
> + err = mark_free(&scan, ww, vma, flags,
> &eviction_list);
> + if (!err)
> goto found;
> + if (err == -EDEADLK)
> + break;
> }
>
> /* Nothing found, clean up and bail out! */
> list_for_each_entry_safe(vma, next, &eviction_list,
> evict_link) {
> ret = drm_mm_scan_remove_block(&scan, &vma->node);
> BUG_ON(ret);
> - ungrab_vma(vma);
> + ungrab_vma(vma, ww);
> }
>
> + if (err == -EDEADLK)
> + return err;
> +
> /*
> * Can we unpin some objects such as idle hw contents,
> * or pending flips? But since only the GGTT has global
> entries
> @@ -267,7 +283,7 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
> __i915_vma_pin(vma);
> } else {
> list_del(&vma->evict_link);
> - ungrab_vma(vma);
> + ungrab_vma(vma, ww);
> }
> }
>
> @@ -277,17 +293,21 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
> __i915_vma_unpin(vma);
> if (ret == 0)
> ret = __i915_vma_unbind(vma);
> - ungrab_vma(vma);
> + ungrab_vma(vma, ww);
> }
>
> while (ret == 0 && (node = drm_mm_scan_color_evict(&scan))) {
> vma = container_of(node, struct i915_vma, node);
>
> /* If we find any non-objects (!vma), we cannot evict
> them */
> - if (vma->node.color != I915_COLOR_UNEVICTABLE &&
> - grab_vma(vma, ww)) {
> - ret = __i915_vma_unbind(vma);
> - ungrab_vma(vma);
> + if (vma->node.color != I915_COLOR_UNEVICTABLE) {
> + ret = grab_vma(vma, ww);
> + if (!ret) {
> + ret = __i915_vma_unbind(vma);
> + ungrab_vma(vma, ww);
> + } else if (ret != -EDEADLK) {
> + ret = -ENOSPC;
> + }
> } else {
> ret = -ENOSPC;
> }
> @@ -382,8 +402,11 @@ int i915_gem_evict_for_node(struct
> i915_address_space *vm,
> break;
> }
>
> - if (!grab_vma(vma, ww)) {
> - ret = -ENOSPC;
> + ret = grab_vma(vma, ww);
> + if (ret) {
> + if (ret != -EDEADLK)
> + ret = -ENOSPC;
> +
> break;
> }
>
> @@ -405,7 +428,7 @@ int i915_gem_evict_for_node(struct
> i915_address_space *vm,
> if (ret == 0)
> ret = __i915_vma_unbind(vma);
>
> - ungrab_vma(vma);
> + ungrab_vma(vma, ww);
> }
>
> return ret;
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Intel-gfx] ✗ Fi.CI.BAT: failure for drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects
2022-08-17 6:55 [Intel-gfx] [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects Sviatoslav Peleshko
2022-08-17 7:34 ` Thomas Hellström
@ 2022-08-17 19:04 ` Patchwork
1 sibling, 0 replies; 4+ messages in thread
From: Patchwork @ 2022-08-17 19:04 UTC (permalink / raw)
To: Sviatoslav Peleshko; +Cc: intel-gfx
[-- Attachment #1: Type: text/plain, Size: 7791 bytes --]
== Series Details ==
Series: drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects
URL : https://patchwork.freedesktop.org/series/107419/
State : failure
== Summary ==
CI Bug Log - changes from CI_DRM_11994 -> Patchwork_107419v1
====================================================
Summary
-------
**FAILURE**
Serious unknown changes coming with Patchwork_107419v1 absolutely need to be
verified manually.
If you think the reported changes have nothing to do with the changes
introduced in Patchwork_107419v1, please notify your bug team to allow them
to document this new failure mode, which will reduce false positives in CI.
External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/index.html
Participating hosts (30 -> 30)
------------------------------
Additional (2): fi-kbl-soraka fi-hsw-4770
Missing (2): bat-rpls-1 fi-kbl-guc
Possible new issues
-------------------
Here are the unknown changes that may have been introduced in Patchwork_107419v1:
### IGT changes ###
#### Possible regressions ####
* igt@gem_exec_gttfill@basic:
- fi-elk-e7500: [PASS][1] -> [INCOMPLETE][2]
[1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11994/fi-elk-e7500/igt@gem_exec_gttfill@basic.html
[2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-elk-e7500/igt@gem_exec_gttfill@basic.html
* igt@i915_selftest@live@hangcheck:
- fi-bdw-gvtdvm: [PASS][3] -> [DMESG-WARN][4]
[3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11994/fi-bdw-gvtdvm/igt@i915_selftest@live@hangcheck.html
[4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-bdw-gvtdvm/igt@i915_selftest@live@hangcheck.html
Known issues
------------
Here are the changes found in Patchwork_107419v1 that come from known issues:
### IGT changes ###
#### Issues hit ####
* igt@gem_exec_suspend@basic-s3@smem:
- fi-hsw-4770: NOTRUN -> [INCOMPLETE][5] ([i915#4939])
[5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-hsw-4770/igt@gem_exec_suspend@basic-s3@smem.html
* igt@i915_module_load@reload:
- fi-hsw-4770: NOTRUN -> [WARN][6] ([i915#6596])
[6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-hsw-4770/igt@i915_module_load@reload.html
* igt@i915_pm_backlight@basic-brightness:
- fi-hsw-4770: NOTRUN -> [SKIP][7] ([fdo#109271] / [i915#3012])
[7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-hsw-4770/igt@i915_pm_backlight@basic-brightness.html
* igt@i915_pm_rpm@module-reload:
- fi-hsw-4770: NOTRUN -> [FAIL][8] ([i915#6593])
[8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-hsw-4770/igt@i915_pm_rpm@module-reload.html
* igt@i915_selftest@live:
- fi-hsw-4770: NOTRUN -> [SKIP][9] ([fdo#109271]) +10 similar issues
[9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-hsw-4770/igt@i915_selftest@live.html
* igt@i915_selftest@live@gem:
- fi-blb-e6850: NOTRUN -> [DMESG-FAIL][10] ([i915#4528])
[10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-blb-e6850/igt@i915_selftest@live@gem.html
* igt@i915_selftest@live@hangcheck:
- bat-dg1-6: NOTRUN -> [DMESG-FAIL][11] ([i915#4494] / [i915#4957])
[11]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/bat-dg1-6/igt@i915_selftest@live@hangcheck.html
* igt@i915_suspend@basic-s2idle-without-i915:
- fi-hsw-4770: NOTRUN -> [FAIL][12] ([i915#6559]) +1 similar issue
[12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-hsw-4770/igt@i915_suspend@basic-s2idle-without-i915.html
- bat-dg1-6: NOTRUN -> [INCOMPLETE][13] ([i915#6011])
[13]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/bat-dg1-6/igt@i915_suspend@basic-s2idle-without-i915.html
* igt@kms_chamelium@dp-crc-fast:
- fi-hsw-4770: NOTRUN -> [SKIP][14] ([fdo#109271] / [fdo#111827]) +7 similar issues
[14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-hsw-4770/igt@kms_chamelium@dp-crc-fast.html
* igt@kms_psr@sprite_plane_onoff:
- fi-hsw-4770: NOTRUN -> [SKIP][15] ([fdo#109271] / [i915#1072]) +3 similar issues
[15]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-hsw-4770/igt@kms_psr@sprite_plane_onoff.html
* igt@runner@aborted:
- fi-elk-e7500: NOTRUN -> [FAIL][16] ([i915#4312])
[16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-elk-e7500/igt@runner@aborted.html
- fi-kbl-soraka: NOTRUN -> [FAIL][17] ([i915#6219])
[17]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-kbl-soraka/igt@runner@aborted.html
- fi-bdw-gvtdvm: NOTRUN -> [FAIL][18] ([i915#4312])
[18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-bdw-gvtdvm/igt@runner@aborted.html
#### Possible fixes ####
* igt@i915_selftest@live@gt_engines:
- bat-dg1-6: [INCOMPLETE][19] ([i915#4418]) -> [PASS][20]
[19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11994/bat-dg1-6/igt@i915_selftest@live@gt_engines.html
[20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/bat-dg1-6/igt@i915_selftest@live@gt_engines.html
* igt@i915_selftest@live@requests:
- fi-blb-e6850: [DMESG-FAIL][21] ([i915#4528]) -> [PASS][22]
[21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11994/fi-blb-e6850/igt@i915_selftest@live@requests.html
[22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-blb-e6850/igt@i915_selftest@live@requests.html
* igt@kms_cursor_legacy@basic-busy-flip-before-cursor@atomic-transitions:
- fi-bsw-kefka: [FAIL][23] ([i915#6298]) -> [PASS][24]
[23]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11994/fi-bsw-kefka/igt@kms_cursor_legacy@basic-busy-flip-before-cursor@atomic-transitions.html
[24]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/fi-bsw-kefka/igt@kms_cursor_legacy@basic-busy-flip-before-cursor@atomic-transitions.html
[fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
[fdo#111827]: https://bugs.freedesktop.org/show_bug.cgi?id=111827
[i915#1072]: https://gitlab.freedesktop.org/drm/intel/issues/1072
[i915#3012]: https://gitlab.freedesktop.org/drm/intel/issues/3012
[i915#4312]: https://gitlab.freedesktop.org/drm/intel/issues/4312
[i915#4418]: https://gitlab.freedesktop.org/drm/intel/issues/4418
[i915#4494]: https://gitlab.freedesktop.org/drm/intel/issues/4494
[i915#4528]: https://gitlab.freedesktop.org/drm/intel/issues/4528
[i915#4939]: https://gitlab.freedesktop.org/drm/intel/issues/4939
[i915#4957]: https://gitlab.freedesktop.org/drm/intel/issues/4957
[i915#6011]: https://gitlab.freedesktop.org/drm/intel/issues/6011
[i915#6219]: https://gitlab.freedesktop.org/drm/intel/issues/6219
[i915#6298]: https://gitlab.freedesktop.org/drm/intel/issues/6298
[i915#6559]: https://gitlab.freedesktop.org/drm/intel/issues/6559
[i915#6593]: https://gitlab.freedesktop.org/drm/intel/issues/6593
[i915#6596]: https://gitlab.freedesktop.org/drm/intel/issues/6596
Build changes
-------------
* Linux: CI_DRM_11994 -> Patchwork_107419v1
CI-20190529: 20190529
CI_DRM_11994: 8a680797ba1c5c96edf296a7e6a46365efe5ef6f @ git://anongit.freedesktop.org/gfx-ci/linux
IGT_6630: b5e2222c9a988015bdf237e6ebb9f5b6d87ac7e1 @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
Patchwork_107419v1: 8a680797ba1c5c96edf296a7e6a46365efe5ef6f @ git://anongit.freedesktop.org/gfx-ci/linux
### Linux commits
149efb8bf00d drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects
== Logs ==
For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_107419v1/index.html
[-- Attachment #2: Type: text/html, Size: 9205 bytes --]
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [Intel-gfx] [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects
2022-08-17 7:34 ` Thomas Hellström
@ 2022-08-30 14:19 ` Tvrtko Ursulin
0 siblings, 0 replies; 4+ messages in thread
From: Tvrtko Ursulin @ 2022-08-30 14:19 UTC (permalink / raw)
To: Thomas Hellström, Sviatoslav Peleshko
Cc: David Airlie, intel-gfx, linux-kernel, dri-devel, Daniel Vetter,
Rodrigo Vivi
On 17/08/2022 08:34, Thomas Hellström wrote:
> On Wed, 2022-08-17 at 09:55 +0300, Sviatoslav Peleshko wrote:
>> The i915_gem_object_trylock we had in the grab_vma() makes it return
>> false
>> when the vma->obj is already locked. In this case we'll skip this vma
>> during eviction, and eventually might be forced to return -ENOSPC
>> even
>> though we could've evicted this vma if we waited for the lock a bit.
>>
>> To fix this, replace the i915_gem_object_trylock with
>> i915_gem_object_lock.
>> And because we have to worry about the potential deadlock now,
>> bubble-up
>> the error code, so it will be correctly handled by the WW mechanism.
>>
>> This fixes the issue
>> https://gitlab.freedesktop.org/drm/intel/-/issues/6564
>>
>> Fixes: 7e00897be8bf ("drm/i915: Add object locking to
>> i915_gem_evict_for_node and i915_gem_evict_something, v2.")
>> Signed-off-by: Sviatoslav Peleshko
>> <sviatoslav.peleshko@globallogic.com>
>> ---
>> drivers/gpu/drm/i915/i915_gem_evict.c | 69 ++++++++++++++++++-------
>> --
>> 1 file changed, 46 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c
>> b/drivers/gpu/drm/i915/i915_gem_evict.c
>> index f025ee4fa526..9d43f213f68f 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
>> @@ -55,49 +55,58 @@ static int ggtt_flush(struct intel_gt *gt)
>> return intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
>> }
>>
>> -static bool grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
>> *ww)
>> +static int grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
>> *ww)
>> {
>> + int ret = 0;
>> +
>> /*
>> * We add the extra refcount so the object doesn't drop to
>> zero until
>> * after ungrab_vma(), this way trylock is always paired with
>> unlock.
>> */
>> if (i915_gem_object_get_rcu(vma->obj)) {
>> - if (!i915_gem_object_trylock(vma->obj, ww)) {
>> + ret = i915_gem_object_lock(vma->obj, ww);
>
> Isn't the vm->mutex held here? If so, then this would be a lockdep
> violation.
Hm.. could the trylock site use a comment exaplaining the reasoning? Ie.
trylock not just skipping "busy" objects but truly unavoidable?
Regardless, is the analysis on the spot and are we working on fixing in
somehow?
Regards,
Tvrtko
>
> /Thomas
>
>
>
>
>> + if (ret)
>> i915_gem_object_put(vma->obj);
>> - return false;
>> - }
>> } else {
>> /* Dead objects don't need pins */
>> atomic_and(~I915_VMA_PIN_MASK, &vma->flags);
>> }
>>
>> - return true;
>> + return ret;
>> }
>>
>> -static void ungrab_vma(struct i915_vma *vma)
>> +static void ungrab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
>> *ww)
>> {
>> if (dying_vma(vma))
>> return;
>>
>> - i915_gem_object_unlock(vma->obj);
>> + if (!ww)
>> + i915_gem_object_unlock(vma->obj);
>> +
>> i915_gem_object_put(vma->obj);
>> }
>>
>> -static bool
>> +static int
>> mark_free(struct drm_mm_scan *scan,
>> struct i915_gem_ww_ctx *ww,
>> struct i915_vma *vma,
>> unsigned int flags,
>> struct list_head *unwind)
>> {
>> + int err;
>> +
>> if (i915_vma_is_pinned(vma))
>> - return false;
>> + return -ENOSPC;
>>
>> - if (!grab_vma(vma, ww))
>> - return false;
>> + err = grab_vma(vma, ww);
>> + if (err)
>> + return err;
>>
>> list_add(&vma->evict_link, unwind);
>> - return drm_mm_scan_add_block(scan, &vma->node);
>> + if (!drm_mm_scan_add_block(scan, &vma->node))
>> + return -ENOSPC;
>> +
>> + return 0;
>> }
>>
>> static bool defer_evict(struct i915_vma *vma)
>> @@ -150,6 +159,7 @@ i915_gem_evict_something(struct
>> i915_address_space *vm,
>> enum drm_mm_insert_mode mode;
>> struct i915_vma *active;
>> int ret;
>> + int err = 0;
>>
>> lockdep_assert_held(&vm->mutex);
>> trace_i915_gem_evict(vm, min_size, alignment, flags);
>> @@ -210,17 +220,23 @@ i915_gem_evict_something(struct
>> i915_address_space *vm,
>> continue;
>> }
>>
>> - if (mark_free(&scan, ww, vma, flags, &eviction_list))
>> + err = mark_free(&scan, ww, vma, flags,
>> &eviction_list);
>> + if (!err)
>> goto found;
>> + if (err == -EDEADLK)
>> + break;
>> }
>>
>> /* Nothing found, clean up and bail out! */
>> list_for_each_entry_safe(vma, next, &eviction_list,
>> evict_link) {
>> ret = drm_mm_scan_remove_block(&scan, &vma->node);
>> BUG_ON(ret);
>> - ungrab_vma(vma);
>> + ungrab_vma(vma, ww);
>> }
>>
>> + if (err == -EDEADLK)
>> + return err;
>> +
>> /*
>> * Can we unpin some objects such as idle hw contents,
>> * or pending flips? But since only the GGTT has global
>> entries
>> @@ -267,7 +283,7 @@ i915_gem_evict_something(struct
>> i915_address_space *vm,
>> __i915_vma_pin(vma);
>> } else {
>> list_del(&vma->evict_link);
>> - ungrab_vma(vma);
>> + ungrab_vma(vma, ww);
>> }
>> }
>>
>> @@ -277,17 +293,21 @@ i915_gem_evict_something(struct
>> i915_address_space *vm,
>> __i915_vma_unpin(vma);
>> if (ret == 0)
>> ret = __i915_vma_unbind(vma);
>> - ungrab_vma(vma);
>> + ungrab_vma(vma, ww);
>> }
>>
>> while (ret == 0 && (node = drm_mm_scan_color_evict(&scan))) {
>> vma = container_of(node, struct i915_vma, node);
>>
>> /* If we find any non-objects (!vma), we cannot evict
>> them */
>> - if (vma->node.color != I915_COLOR_UNEVICTABLE &&
>> - grab_vma(vma, ww)) {
>> - ret = __i915_vma_unbind(vma);
>> - ungrab_vma(vma);
>> + if (vma->node.color != I915_COLOR_UNEVICTABLE) {
>> + ret = grab_vma(vma, ww);
>> + if (!ret) {
>> + ret = __i915_vma_unbind(vma);
>> + ungrab_vma(vma, ww);
>> + } else if (ret != -EDEADLK) {
>> + ret = -ENOSPC;
>> + }
>> } else {
>> ret = -ENOSPC;
>> }
>> @@ -382,8 +402,11 @@ int i915_gem_evict_for_node(struct
>> i915_address_space *vm,
>> break;
>> }
>>
>> - if (!grab_vma(vma, ww)) {
>> - ret = -ENOSPC;
>> + ret = grab_vma(vma, ww);
>> + if (ret) {
>> + if (ret != -EDEADLK)
>> + ret = -ENOSPC;
>> +
>> break;
>> }
>>
>> @@ -405,7 +428,7 @@ int i915_gem_evict_for_node(struct
>> i915_address_space *vm,
>> if (ret == 0)
>> ret = __i915_vma_unbind(vma);
>>
>> - ungrab_vma(vma);
>> + ungrab_vma(vma, ww);
>> }
>>
>> return ret;
>
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2022-08-30 14:19 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-08-17 6:55 [Intel-gfx] [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects Sviatoslav Peleshko
2022-08-17 7:34 ` Thomas Hellström
2022-08-30 14:19 ` Tvrtko Ursulin
2022-08-17 19:04 ` [Intel-gfx] ✗ Fi.CI.BAT: failure for " Patchwork
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox