Re: [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches

public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed

From: Ben Widawsky <ben@bwidawsk.net>
To: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: intel-gfx@lists.freedesktop.org, miku@iki.fi,
	Ben Widawsky <benjamin.widawsky@intel.com>
Subject: Re: [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches
Date: Tue, 11 Feb 2014 17:57:19 -0800	[thread overview]
Message-ID: <20140212015719.GD23746@bwidawsk.net> (raw)
In-Reply-To: <1392042651-3278-1-git-send-email-mika.kuoppala@intel.com>

On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> In the past, it was possible to have multiple batches per request due to
> a stray signal or ENOMEM. As a result we had to scan each active object
> (filtered by those having the COMMAND domain) for the one that contained
> the ACTHD pointer. This was then made more complicated by the
> introduction of ppgtt, whereby ACTHD then pointed into the address space
> of the context and so also needed to be taken into account.
> 
> This is a fairly robust approach (though the implementation is a little
> fragile and depends upon the per-generation setup, registers and
> parameters). However, due to the requirements for hangstats, we needed a
> robust method for associating batches with a particular request and
> having that we can rely upon it for finding the associated batch object
> for error capture.
> 
> If the batch buffer tracking is not robust enough, that should become
> apparent quite quickly through an erroneous error capture. That should
> also help to make sure that the runtime reporting to userspace is
> robust. It also means that we then report the oldest incomplete batch on
> each ring, which can be useful for determining the state of userspace at
> the time of a hang.
> 
> v2: Use i915_gem_find_active_request (Mika)
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1)
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v2)
> Cc: Ben Widawsky <benjamin.widawsky@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |    3 ++
>  drivers/gpu/drm/i915/i915_gem.c       |   13 +++++--
>  drivers/gpu/drm/i915/i915_gpu_error.c |   66 ++++-----------------------------
>  3 files changed, 20 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b1e91c3..eb260bc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2147,6 +2147,9 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
>  	}
>  }
>  
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring);
> +
>  bool i915_gem_retire_requests(struct drm_device *dev);
>  void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
>  int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b0a244a..20e55ef 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2298,11 +2298,16 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
>  	kfree(request);
>  }
>  
> -static struct drm_i915_gem_request *
> -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring)
>  {
>  	struct drm_i915_gem_request *request;
> -	const u32 completed_seqno = ring->get_seqno(ring, false);
> +	u32 completed_seqno;
> +
> +	if (WARN_ON(!ring->get_seqno))
> +		return NULL;

ring->get_seqno(ring, false) ?

This looks like it's currently wrong below as well.

> +
> +	completed_seqno = ring->get_seqno(ring, false);
>  
>  	list_for_each_entry(request, &ring->request_list, list) {
>  		if (i915_seqno_passed(completed_seqno, request->seqno))
> @@ -2320,7 +2325,7 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  	struct drm_i915_gem_request *request;
>  	bool ring_hung;
>  
> -	request = i915_gem_find_first_non_complete(ring);
> +	request = i915_gem_find_active_request(ring);
>  
>  	if (request == NULL)
>  		return;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index dc47bb9..5bd075a 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -713,46 +713,14 @@ static void i915_gem_record_fences(struct drm_device *dev,
>  	}
>  }
>  
> -/* This assumes all batchbuffers are executed from the PPGTT. It might have to
> - * change in the future. */
> -static bool is_active_vm(struct i915_address_space *vm,
> -			 struct intel_ring_buffer *ring)
> -{
> -	struct drm_device *dev = vm->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	struct i915_hw_ppgtt *ppgtt;
> -
> -	if (INTEL_INFO(dev)->gen < 7)
> -		return i915_is_ggtt(vm);
> -
> -	/* FIXME: This ignores that the global gtt vm is also on this list. */
> -	ppgtt = container_of(vm, struct i915_hw_ppgtt, base);
> -
> -	if (INTEL_INFO(dev)->gen >= 8) {
> -		u64 pdp0 = (u64)I915_READ(GEN8_RING_PDP_UDW(ring, 0)) << 32;
> -		pdp0 |=  I915_READ(GEN8_RING_PDP_LDW(ring, 0));
> -		return pdp0 == ppgtt->pd_dma_addr[0];
> -	} else {
> -		u32 pp_db;
> -		pp_db = I915_READ(RING_PP_DIR_BASE(ring));
> -		return (pp_db >> 10) == ppgtt->pd_offset;
> -	}
> -}
> -
>  static struct drm_i915_error_object *
>  i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  			     struct intel_ring_buffer *ring)
>  {
> -	struct i915_address_space *vm;
> -	struct i915_vma *vma;
> -	struct drm_i915_gem_object *obj;
> -	bool found_active = false;
> -	u32 seqno;
> -
> -	if (!ring->get_seqno)
> -		return NULL;
> +	struct drm_i915_gem_request *request;
>  
>  	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
> +		struct drm_i915_gem_object *obj;
>  		u32 acthd = I915_READ(ACTHD);
>  
>  		if (WARN_ON(ring->id != RCS))
> @@ -765,32 +733,14 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  			return i915_error_ggtt_object_create(dev_priv, obj);
>  	}
>  
> -	seqno = ring->get_seqno(ring, false);
> -	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> -		if (!is_active_vm(vm, ring))
> -			continue;
> -
> -		found_active = true;
> -
> -		list_for_each_entry(vma, &vm->active_list, mm_list) {
> -			obj = vma->obj;
> -			if (obj->ring != ring)
> -				continue;
> -
> -			if (i915_seqno_passed(seqno, obj->last_read_seqno))
> -				continue;
> -
> -			if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
> -				continue;
> -
> -			/* We need to copy these to an anonymous buffer as the simplest
> -			 * method to avoid being overwritten by userspace.
> -			 */
> -			return i915_error_object_create(dev_priv, obj, vm);
> -		}
> +	request = i915_gem_find_active_request(ring);
> +	if (request) {
> +		/* We need to copy these to an anonymous buffer as the simplest
> +		 * method to avoid being overwritten by userspace.
> +		 */
> +		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);

Wrap this one. It's too long even for my undiscerning eye.

>  	}
>  
> -	WARN_ON(!found_active);
>  	return NULL;
>  }
>  


I still would have preferred to keep both methods for a while until we
felt really comfortable with this... the commit message's statement that
getting bad error state is inevitably a good thing is total bunk, I
think.

However, I'll act as a Daniel proxy here who seems in favor of this
path. The code is definitely simpler, I am just a chicken.

With the above fixed:
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

-- 
Ben Widawsky, Intel Open Source Technology Center

next prev parent reply	other threads:[~2014-02-12  1:57 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-02-10 14:30 [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Mika Kuoppala
2014-02-10 14:30 ` [PATCH 2/3] drm/i915: Record pid/comm of hanging task Mika Kuoppala
2014-02-12  2:07   ` Ben Widawsky
2014-02-12  8:15     ` Chris Wilson
2014-02-12 18:55       ` Ben Widawsky
2014-02-12 19:18         ` Chris Wilson
2014-02-12 19:32           ` Ben Widawsky
2014-03-05 13:08             ` Chris Wilson
2014-03-05 13:34               ` Daniel Vetter
2014-03-05 13:50                 ` Chris Wilson
2014-02-10 14:30 ` [PATCH 3/3] drm/i915: Record error state capture reason Mika Kuoppala
2014-02-12  2:32   ` Ben Widawsky
2014-02-12  8:13     ` Chris Wilson
2014-02-12 19:05       ` Ben Widawsky
2014-02-12  1:57 ` Ben Widawsky [this message]
2014-02-12  8:25   ` [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Chris Wilson
2014-02-12 19:02     ` Ben Widawsky
2014-02-12 19:15       ` Chris Wilson
2014-02-12 17:47 ` Chris Wilson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140212015719.GD23746@bwidawsk.net \
    --to=ben@bwidawsk.net \
    --cc=benjamin.widawsky@intel.com \
    --cc=intel-gfx@lists.freedesktop.org \
    --cc=mika.kuoppala@linux.intel.com \
    --cc=miku@iki.fi \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox