public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed
From: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
To: Sviatoslav Peleshko <sviatoslav.peleshko@globallogic.com>
Cc: David Airlie <airlied@linux.ie>,
	intel-gfx@lists.freedesktop.org, linux-kernel@vger.kernel.org,
	dri-devel@lists.freedesktop.org, Daniel Vetter <daniel@ffwll.ch>,
	Rodrigo Vivi <rodrigo.vivi@intel.com>
Subject: Re: [Intel-gfx] [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects
Date: Wed, 17 Aug 2022 09:34:40 +0200	[thread overview]
Message-ID: <49c1adc00bdd7aba3d3ffd1ed5e53de2d7d5f1a1.camel@linux.intel.com> (raw)
In-Reply-To: <20220817065541.30101-1-sviatoslav.peleshko@globallogic.com>

On Wed, 2022-08-17 at 09:55 +0300, Sviatoslav Peleshko wrote:
> The i915_gem_object_trylock we had in the grab_vma() makes it return
> false
> when the vma->obj is already locked. In this case we'll skip this vma
> during eviction, and eventually might be forced to return -ENOSPC
> even
> though we could've evicted this vma if we waited for the lock a bit.
> 
> To fix this, replace the i915_gem_object_trylock with
> i915_gem_object_lock.
> And because we have to worry about the potential deadlock now,
> bubble-up
> the error code, so it will be correctly handled by the WW mechanism.
> 
> This fixes the issue
> https://gitlab.freedesktop.org/drm/intel/-/issues/6564
> 
> Fixes: 7e00897be8bf ("drm/i915: Add object locking to
> i915_gem_evict_for_node and i915_gem_evict_something, v2.")
> Signed-off-by: Sviatoslav Peleshko
> <sviatoslav.peleshko@globallogic.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_evict.c | 69 ++++++++++++++++++-------
> --
>  1 file changed, 46 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c
> b/drivers/gpu/drm/i915/i915_gem_evict.c
> index f025ee4fa526..9d43f213f68f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -55,49 +55,58 @@ static int ggtt_flush(struct intel_gt *gt)
>         return intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
>  }
>  
> -static bool grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
> +static int grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
>  {
> +       int ret = 0;
> +
>         /*
>          * We add the extra refcount so the object doesn't drop to
> zero until
>          * after ungrab_vma(), this way trylock is always paired with
> unlock.
>          */
>         if (i915_gem_object_get_rcu(vma->obj)) {
> -               if (!i915_gem_object_trylock(vma->obj, ww)) {
> +               ret = i915_gem_object_lock(vma->obj, ww);

Isn't the vm->mutex held here? If so, then this would be a lockdep
violation.

/Thomas




> +               if (ret)
>                         i915_gem_object_put(vma->obj);
> -                       return false;
> -               }
>         } else {
>                 /* Dead objects don't need pins */
>                 atomic_and(~I915_VMA_PIN_MASK, &vma->flags);
>         }
>  
> -       return true;
> +       return ret;
>  }
>  
> -static void ungrab_vma(struct i915_vma *vma)
> +static void ungrab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
>  {
>         if (dying_vma(vma))
>                 return;
>  
> -       i915_gem_object_unlock(vma->obj);
> +       if (!ww)
> +               i915_gem_object_unlock(vma->obj);
> +
>         i915_gem_object_put(vma->obj);
>  }
>  
> -static bool
> +static int
>  mark_free(struct drm_mm_scan *scan,
>           struct i915_gem_ww_ctx *ww,
>           struct i915_vma *vma,
>           unsigned int flags,
>           struct list_head *unwind)
>  {
> +       int err;
> +
>         if (i915_vma_is_pinned(vma))
> -               return false;
> +               return -ENOSPC;
>  
> -       if (!grab_vma(vma, ww))
> -               return false;
> +       err = grab_vma(vma, ww);
> +       if (err)
> +               return err;
>  
>         list_add(&vma->evict_link, unwind);
> -       return drm_mm_scan_add_block(scan, &vma->node);
> +       if (!drm_mm_scan_add_block(scan, &vma->node))
> +               return -ENOSPC;
> +
> +       return 0;
>  }
>  
>  static bool defer_evict(struct i915_vma *vma)
> @@ -150,6 +159,7 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
>         enum drm_mm_insert_mode mode;
>         struct i915_vma *active;
>         int ret;
> +       int err = 0;
>  
>         lockdep_assert_held(&vm->mutex);
>         trace_i915_gem_evict(vm, min_size, alignment, flags);
> @@ -210,17 +220,23 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
>                         continue;
>                 }
>  
> -               if (mark_free(&scan, ww, vma, flags, &eviction_list))
> +               err = mark_free(&scan, ww, vma, flags,
> &eviction_list);
> +               if (!err)
>                         goto found;
> +               if (err == -EDEADLK)
> +                       break;
>         }
>  
>         /* Nothing found, clean up and bail out! */
>         list_for_each_entry_safe(vma, next, &eviction_list,
> evict_link) {
>                 ret = drm_mm_scan_remove_block(&scan, &vma->node);
>                 BUG_ON(ret);
> -               ungrab_vma(vma);
> +               ungrab_vma(vma, ww);
>         }
>  
> +       if (err == -EDEADLK)
> +               return err;
> +
>         /*
>          * Can we unpin some objects such as idle hw contents,
>          * or pending flips? But since only the GGTT has global
> entries
> @@ -267,7 +283,7 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
>                         __i915_vma_pin(vma);
>                 } else {
>                         list_del(&vma->evict_link);
> -                       ungrab_vma(vma);
> +                       ungrab_vma(vma, ww);
>                 }
>         }
>  
> @@ -277,17 +293,21 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
>                 __i915_vma_unpin(vma);
>                 if (ret == 0)
>                         ret = __i915_vma_unbind(vma);
> -               ungrab_vma(vma);
> +               ungrab_vma(vma, ww);
>         }
>  
>         while (ret == 0 && (node = drm_mm_scan_color_evict(&scan))) {
>                 vma = container_of(node, struct i915_vma, node);
>  
>                 /* If we find any non-objects (!vma), we cannot evict
> them */
> -               if (vma->node.color != I915_COLOR_UNEVICTABLE &&
> -                   grab_vma(vma, ww)) {
> -                       ret = __i915_vma_unbind(vma);
> -                       ungrab_vma(vma);
> +               if (vma->node.color != I915_COLOR_UNEVICTABLE) {
> +                       ret = grab_vma(vma, ww);
> +                       if (!ret) {
> +                               ret = __i915_vma_unbind(vma);
> +                               ungrab_vma(vma, ww);
> +                       } else if (ret != -EDEADLK) {
> +                               ret = -ENOSPC;
> +                       }
>                 } else {
>                         ret = -ENOSPC;
>                 }
> @@ -382,8 +402,11 @@ int i915_gem_evict_for_node(struct
> i915_address_space *vm,
>                         break;
>                 }
>  
> -               if (!grab_vma(vma, ww)) {
> -                       ret = -ENOSPC;
> +               ret = grab_vma(vma, ww);
> +               if (ret) {
> +                       if (ret != -EDEADLK)
> +                               ret = -ENOSPC;
> +
>                         break;
>                 }
>  
> @@ -405,7 +428,7 @@ int i915_gem_evict_for_node(struct
> i915_address_space *vm,
>                 if (ret == 0)
>                         ret = __i915_vma_unbind(vma);
>  
> -               ungrab_vma(vma);
> +               ungrab_vma(vma, ww);
>         }
>  
>         return ret;


  reply	other threads:[~2022-08-24 18:06 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-17  6:55 [Intel-gfx] [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects Sviatoslav Peleshko
2022-08-17  7:34 ` Thomas Hellström [this message]
2022-08-30 14:19   ` Tvrtko Ursulin
2022-08-17 19:04 ` [Intel-gfx] ✗ Fi.CI.BAT: failure for " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=49c1adc00bdd7aba3d3ffd1ed5e53de2d7d5f1a1.camel@linux.intel.com \
    --to=thomas.hellstrom@linux.intel.com \
    --cc=airlied@linux.ie \
    --cc=daniel@ffwll.ch \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=intel-gfx@lists.freedesktop.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=rodrigo.vivi@intel.com \
    --cc=sviatoslav.peleshko@globallogic.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox