From: Ben Widawsky <ben@bwidawsk.net>
To: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: intel-gfx@lists.freedesktop.org, miku@iki.fi,
Ben Widawsky <benjamin.widawsky@intel.com>
Subject: Re: [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches
Date: Tue, 11 Feb 2014 17:57:19 -0800 [thread overview]
Message-ID: <20140212015719.GD23746@bwidawsk.net> (raw)
In-Reply-To: <1392042651-3278-1-git-send-email-mika.kuoppala@intel.com>
On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
>
> In the past, it was possible to have multiple batches per request due to
> a stray signal or ENOMEM. As a result we had to scan each active object
> (filtered by those having the COMMAND domain) for the one that contained
> the ACTHD pointer. This was then made more complicated by the
> introduction of ppgtt, whereby ACTHD then pointed into the address space
> of the context and so also needed to be taken into account.
>
> This is a fairly robust approach (though the implementation is a little
> fragile and depends upon the per-generation setup, registers and
> parameters). However, due to the requirements for hangstats, we needed a
> robust method for associating batches with a particular request and
> having that we can rely upon it for finding the associated batch object
> for error capture.
>
> If the batch buffer tracking is not robust enough, that should become
> apparent quite quickly through an erroneous error capture. That should
> also help to make sure that the runtime reporting to userspace is
> robust. It also means that we then report the oldest incomplete batch on
> each ring, which can be useful for determining the state of userspace at
> the time of a hang.
>
> v2: Use i915_gem_find_active_request (Mika)
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1)
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v2)
> Cc: Ben Widawsky <benjamin.widawsky@intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.h | 3 ++
> drivers/gpu/drm/i915/i915_gem.c | 13 +++++--
> drivers/gpu/drm/i915/i915_gpu_error.c | 66 ++++-----------------------------
> 3 files changed, 20 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b1e91c3..eb260bc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2147,6 +2147,9 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
> }
> }
>
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring);
> +
> bool i915_gem_retire_requests(struct drm_device *dev);
> void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
> int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b0a244a..20e55ef 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2298,11 +2298,16 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
> kfree(request);
> }
>
> -static struct drm_i915_gem_request *
> -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring)
> {
> struct drm_i915_gem_request *request;
> - const u32 completed_seqno = ring->get_seqno(ring, false);
> + u32 completed_seqno;
> +
> + if (WARN_ON(!ring->get_seqno))
> + return NULL;
ring->get_seqno(ring, false) ?
This looks like it's currently wrong below as well.
> +
> + completed_seqno = ring->get_seqno(ring, false);
>
> list_for_each_entry(request, &ring->request_list, list) {
> if (i915_seqno_passed(completed_seqno, request->seqno))
> @@ -2320,7 +2325,7 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
> struct drm_i915_gem_request *request;
> bool ring_hung;
>
> - request = i915_gem_find_first_non_complete(ring);
> + request = i915_gem_find_active_request(ring);
>
> if (request == NULL)
> return;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index dc47bb9..5bd075a 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -713,46 +713,14 @@ static void i915_gem_record_fences(struct drm_device *dev,
> }
> }
>
> -/* This assumes all batchbuffers are executed from the PPGTT. It might have to
> - * change in the future. */
> -static bool is_active_vm(struct i915_address_space *vm,
> - struct intel_ring_buffer *ring)
> -{
> - struct drm_device *dev = vm->dev;
> - struct drm_i915_private *dev_priv = dev->dev_private;
> - struct i915_hw_ppgtt *ppgtt;
> -
> - if (INTEL_INFO(dev)->gen < 7)
> - return i915_is_ggtt(vm);
> -
> - /* FIXME: This ignores that the global gtt vm is also on this list. */
> - ppgtt = container_of(vm, struct i915_hw_ppgtt, base);
> -
> - if (INTEL_INFO(dev)->gen >= 8) {
> - u64 pdp0 = (u64)I915_READ(GEN8_RING_PDP_UDW(ring, 0)) << 32;
> - pdp0 |= I915_READ(GEN8_RING_PDP_LDW(ring, 0));
> - return pdp0 == ppgtt->pd_dma_addr[0];
> - } else {
> - u32 pp_db;
> - pp_db = I915_READ(RING_PP_DIR_BASE(ring));
> - return (pp_db >> 10) == ppgtt->pd_offset;
> - }
> -}
> -
> static struct drm_i915_error_object *
> i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
> struct intel_ring_buffer *ring)
> {
> - struct i915_address_space *vm;
> - struct i915_vma *vma;
> - struct drm_i915_gem_object *obj;
> - bool found_active = false;
> - u32 seqno;
> -
> - if (!ring->get_seqno)
> - return NULL;
> + struct drm_i915_gem_request *request;
>
> if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
> + struct drm_i915_gem_object *obj;
> u32 acthd = I915_READ(ACTHD);
>
> if (WARN_ON(ring->id != RCS))
> @@ -765,32 +733,14 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
> return i915_error_ggtt_object_create(dev_priv, obj);
> }
>
> - seqno = ring->get_seqno(ring, false);
> - list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> - if (!is_active_vm(vm, ring))
> - continue;
> -
> - found_active = true;
> -
> - list_for_each_entry(vma, &vm->active_list, mm_list) {
> - obj = vma->obj;
> - if (obj->ring != ring)
> - continue;
> -
> - if (i915_seqno_passed(seqno, obj->last_read_seqno))
> - continue;
> -
> - if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
> - continue;
> -
> - /* We need to copy these to an anonymous buffer as the simplest
> - * method to avoid being overwritten by userspace.
> - */
> - return i915_error_object_create(dev_priv, obj, vm);
> - }
> + request = i915_gem_find_active_request(ring);
> + if (request) {
> + /* We need to copy these to an anonymous buffer as the simplest
> + * method to avoid being overwritten by userspace.
> + */
> + return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
Wrap this one. It's too long even for my undiscerning eye.
> }
>
> - WARN_ON(!found_active);
> return NULL;
> }
>
I still would have preferred to keep both methods for a while until we
felt really comfortable with this... the commit message's statement that
getting bad error state is inevitably a good thing is total bunk, I
think.
However, I'll act as a Daniel proxy here who seems in favor of this
path. The code is definitely simpler, I am just a chicken.
With the above fixed:
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
--
Ben Widawsky, Intel Open Source Technology Center
next prev parent reply other threads:[~2014-02-12 1:57 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-02-10 14:30 [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Mika Kuoppala
2014-02-10 14:30 ` [PATCH 2/3] drm/i915: Record pid/comm of hanging task Mika Kuoppala
2014-02-12 2:07 ` Ben Widawsky
2014-02-12 8:15 ` Chris Wilson
2014-02-12 18:55 ` Ben Widawsky
2014-02-12 19:18 ` Chris Wilson
2014-02-12 19:32 ` Ben Widawsky
2014-03-05 13:08 ` Chris Wilson
2014-03-05 13:34 ` Daniel Vetter
2014-03-05 13:50 ` Chris Wilson
2014-02-10 14:30 ` [PATCH 3/3] drm/i915: Record error state capture reason Mika Kuoppala
2014-02-12 2:32 ` Ben Widawsky
2014-02-12 8:13 ` Chris Wilson
2014-02-12 19:05 ` Ben Widawsky
2014-02-12 1:57 ` Ben Widawsky [this message]
2014-02-12 8:25 ` [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Chris Wilson
2014-02-12 19:02 ` Ben Widawsky
2014-02-12 19:15 ` Chris Wilson
2014-02-12 17:47 ` Chris Wilson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20140212015719.GD23746@bwidawsk.net \
--to=ben@bwidawsk.net \
--cc=benjamin.widawsky@intel.com \
--cc=intel-gfx@lists.freedesktop.org \
--cc=mika.kuoppala@linux.intel.com \
--cc=miku@iki.fi \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox