All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mika Kuoppala <mika.kuoppala@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang
Date: Mon, 03 Oct 2016 15:25:16 +0300	[thread overview]
Message-ID: <87twct4o2b.fsf@gaia.fi.intel.com> (raw)
In-Reply-To: <20160930075032.30967-1-chris@chris-wilson.co.uk>

Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Braswell, at least, we observe that the context image is written in
> multiple phases. The first phase is to clear the register state, and
> subsequently rewrite it. A GPU reset at the right moment can interrupt
> the context update leaving it corrupt, and our update of the RING_HEAD
> is not sufficient to restart the engine afterwards. To recover, we need
> to reset the registers back to their original values. The context state
> is lost. What we need is a better mechanism to serialise the reset with
> pending flushes from the GPU.
>
> Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 95 +++++++++++++++++++++++-----------------
>  1 file changed, 56 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 2d8eb2eb2b72..d6e762718ff4 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -226,10 +226,16 @@ enum {
>  /* Typical size of the average request (2 pipecontrols and a MI_BB) */
>  #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
>  
> +#define WA_TAIL_DWORDS 2
> +
>  static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
>  					    struct intel_engine_cs *engine);
>  static int intel_lr_context_pin(struct i915_gem_context *ctx,
>  				struct intel_engine_cs *engine);
> +static void execlists_init_reg_state(u32 *reg_state,
> +				     struct i915_gem_context *ctx,
> +				     struct intel_engine_cs *engine,
> +				     struct intel_ring *ring);
>  
>  /**
>   * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
> @@ -707,7 +713,6 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx,
>  {
>  	struct intel_context *ce = &ctx->engine[engine->id];
>  	void *vaddr;
> -	u32 *lrc_reg_state;
>  	int ret;
>  
>  	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
> @@ -726,17 +731,16 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx,
>  		goto unpin_vma;
>  	}
>  
> -	lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
> -
>  	ret = intel_ring_pin(ce->ring);
>  	if (ret)
>  		goto unpin_map;
>  
>  	intel_lr_context_descriptor_update(ctx, engine);
>  
> -	lrc_reg_state[CTX_RING_BUFFER_START+1] =
> +	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
> +	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
>  		i915_ggtt_offset(ce->ring->vma);
> -	ce->lrc_reg_state = lrc_reg_state;
> +
>  	ce->state->obj->dirty = true;
>  
>  	/* Invalidate GuC TLB. */
> @@ -1284,8 +1288,14 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	struct execlist_port *port = engine->execlist_port;
>  	struct intel_context *ce = &request->ctx->engine[engine->id];
>  
> +	execlists_init_reg_state(ce->lrc_reg_state,
> +				 request->ctx, engine, ce->ring);
> +
>  	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
> +	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
> +		i915_ggtt_offset(ce->ring->vma);
>  	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
> +
>  	request->ring->head = request->postfix;
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
> @@ -1305,6 +1315,9 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	GEM_BUG_ON(request->ctx != port[0].request->ctx);
>  	port[0].count = 0;
>  	port[1].count = 0;
> +
> +	/* Reset WaIdleLiteRestore:bdw,skl as well */
> +	request->tail = request->wa_tail - WA_TAIL_DWORDS * sizeof(u32);
>  }
>  
>  static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
> @@ -1542,7 +1555,6 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
>   * used as a workaround for not being allowed to do lite
>   * restore with HEAD==TAIL (WaIdleLiteRestore).
>   */
> -#define WA_TAIL_DWORDS 2
>  
>  static int gen8_emit_request(struct drm_i915_gem_request *request)
>  {
> @@ -1889,38 +1901,13 @@ static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
>  	return indirect_ctx_offset;
>  }
>  
> -static int
> -populate_lr_context(struct i915_gem_context *ctx,
> -		    struct drm_i915_gem_object *ctx_obj,
> -		    struct intel_engine_cs *engine,
> -		    struct intel_ring *ring)
> +static void execlists_init_reg_state(u32 *reg_state,
> +				     struct i915_gem_context *ctx,
> +				     struct intel_engine_cs *engine,
> +				     struct intel_ring *ring)
>  {
> -	struct drm_i915_private *dev_priv = ctx->i915;
> -	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
> -	void *vaddr;
> -	u32 *reg_state;
> -	int ret;
> -
> -	if (!ppgtt)
> -		ppgtt = dev_priv->mm.aliasing_ppgtt;
> -
> -	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
> -		return ret;
> -	}
> -
> -	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
> -	if (IS_ERR(vaddr)) {
> -		ret = PTR_ERR(vaddr);
> -		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
> -		return ret;
> -	}
> -	ctx_obj->dirty = true;
> -
> -	/* The second page of the context object contains some fields which must
> -	 * be set up prior to the first execution. */
> -	reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
>  
>  	/* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
>  	 * commands followed by (reg, value) pairs. The values we are setting here are
> @@ -1934,7 +1921,7 @@ populate_lr_context(struct i915_gem_context *ctx,
>  		       _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
>  					  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
>  					  (HAS_RESOURCE_STREAMER(dev_priv) ?
> -					    CTX_CTRL_RS_CTX_ENABLE : 0)));
> +					   CTX_CTRL_RS_CTX_ENABLE : 0)));
>  	ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(engine->mmio_base),
>  		       0);
>  	ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(engine->mmio_base),
> @@ -1946,7 +1933,7 @@ populate_lr_context(struct i915_gem_context *ctx,
>  		       RING_START(engine->mmio_base), 0);
>  	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL,
>  		       RING_CTL(engine->mmio_base),
> -		       ((ring->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
> +		       (ring->size - PAGE_SIZE) | RING_VALID);

Patch looks good.

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

Not exactly problems with this patch, but as we are in
the territory:

I would still like the ring->size setting to be accompanied
by a comment about it matching the page shift. I have fallen
for it twice now, so I suspect the next reader will too.

And for that matter, removal of the misleading comment

/* It is written to the context image in execlists_update_context() */

in execlists_init_reg_state().

Thanks,
-Mika


>  	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U,
>  		       RING_BBADDR_UDW(engine->mmio_base), 0);
>  	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L,
> @@ -2024,6 +2011,36 @@ populate_lr_context(struct i915_gem_context *ctx,
>  		ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
>  			       make_rpcs(dev_priv));
>  	}
> +}
> +
> +static int
> +populate_lr_context(struct i915_gem_context *ctx,
> +		    struct drm_i915_gem_object *ctx_obj,
> +		    struct intel_engine_cs *engine,
> +		    struct intel_ring *ring)
> +{
> +	void *vaddr;
> +	int ret;
> +
> +	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
> +		return ret;
> +	}
> +
> +	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
> +	if (IS_ERR(vaddr)) {
> +		ret = PTR_ERR(vaddr);
> +		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
> +		return ret;
> +	}
> +	ctx_obj->dirty = true;
> +
> +	/* The second page of the context object contains some fields which must
> +	 * be set up prior to the first execution. */
> +
> +	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
> +				 ctx, engine, ring);
>  
>  	i915_gem_object_unpin_map(ctx_obj);
>  
> -- 
> 2.9.3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

      parent reply	other threads:[~2016-10-03 12:25 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-09-30  7:50 [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang Chris Wilson
2016-09-30  7:50 ` [PATCH 2/3] drm/i915: Call synchronize_irq() after resetting the GPU Chris Wilson
2016-09-30  8:26   ` Mika Kuoppala
2016-09-30  8:38     ` Chris Wilson
2016-09-30 11:25     ` [PATCH] drm/i915: Disable irqs across GPU reset Chris Wilson
2016-10-03 12:42       ` Mika Kuoppala
2016-09-30  7:50 ` [PATCH 3/3] drm/i915: Show RING registers through debugfs Chris Wilson
2016-09-30  8:20 ` ✗ Fi.CI.BAT: warning for series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang Patchwork
2016-09-30 13:18 ` ✗ Fi.CI.BAT: warning for series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang (rev2) Patchwork
2016-10-03 12:25 ` Mika Kuoppala [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87twct4o2b.fsf@gaia.fi.intel.com \
    --to=mika.kuoppala@linux.intel.com \
    --cc=chris@chris-wilson.co.uk \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.