Re: [FIXES 3/3] drm/i915/execlists: Move reset_active() from schedule-out to schedule-in

From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Subject: Re: [FIXES 3/3] drm/i915/execlists: Move reset_active() from schedule-out to schedule-in
Date: Mon, 11 Nov 2019 16:31:09 +0000	[thread overview]
Message-ID: <3261ff60-d953-d13b-bb42-5bf0972c6d2e@linux.intel.com> (raw)
In-Reply-To: <20191111133205.11590-3-chris@chris-wilson.co.uk>

On 11/11/2019 13:32, Chris Wilson wrote:
> The gem_ctx_persistence/smoketest was detecting an odd coherency issue
> inside the LRC context image; that the address of the ring buffer did
> not match our associated struct intel_ring. As we set the address into
> the context image when we pin the ring buffer into place before the
> context is active, that leaves the question of where did it get
> overwritten. Either the HW context save occurred after our pin which
> would imply that our idle barriers are broken, or we overwrote the
> context image ourselves. It is only in reset_active() where we dabble
> inside the context image outside of a serialised path from schedule-out;
> but we could equally perform the operation inside schedule-in which is
> then fully serialised with the context pin -- and remains serialised by
> the engine pulse with kill_context(). (The only downside, aside from
> doing more work inside the engine->active.lock, was the plan to merge
> all the reset paths into doing their context scrubbing on schedule-out
> needs more thought.)

So process_csb is not under the engine lock and hence schedule_out can 
race with schedule_in meaning schedule_out should refrain from doing 
non-trivial work.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

> 
> Fixes: d12acee84ffb ("drm/i915/execlists: Cancel banned contexts on schedule-out")
> Testcase: igt/gem_ctx_persistence/smoketest
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_lrc.c | 114 ++++++++++++++--------------
>   1 file changed, 57 insertions(+), 57 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index e57345795c08..4b6d9e6b1bfd 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -1042,6 +1042,59 @@ execlists_check_context(const struct intel_context *ce,
>   	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
>   }
>   
> +static void restore_default_state(struct intel_context *ce,
> +				  struct intel_engine_cs *engine)
> +{
> +	u32 *regs = ce->lrc_reg_state;
> +
> +	if (engine->pinned_default_state)
> +		memcpy(regs, /* skip restoring the vanilla PPHWSP */
> +		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
> +		       engine->context_size - PAGE_SIZE);
> +
> +	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
> +}
> +
> +static void reset_active(struct i915_request *rq,
> +			 struct intel_engine_cs *engine)
> +{
> +	struct intel_context * const ce = rq->hw_context;
> +	u32 head;
> +
> +	/*
> +	 * The executing context has been cancelled. We want to prevent
> +	 * further execution along this context and propagate the error on
> +	 * to anything depending on its results.
> +	 *
> +	 * In __i915_request_submit(), we apply the -EIO and remove the
> +	 * requests' payloads for any banned requests. But first, we must
> +	 * rewind the context back to the start of the incomplete request so
> +	 * that we do not jump back into the middle of the batch.
> +	 *
> +	 * We preserve the breadcrumbs and semaphores of the incomplete
> +	 * requests so that inter-timeline dependencies (i.e other timelines)
> +	 * remain correctly ordered. And we defer to __i915_request_submit()
> +	 * so that all asynchronous waits are correctly handled.
> +	 */
> +	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
> +		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
> +
> +	/* On resubmission of the active request, payload will be scrubbed */
> +	if (i915_request_completed(rq))
> +		head = rq->tail;
> +	else
> +		head = active_request(ce->timeline, rq)->head;
> +	ce->ring->head = intel_ring_wrap(ce->ring, head);
> +	intel_ring_update_space(ce->ring);
> +
> +	/* Scrub the context image to prevent replaying the previous batch */
> +	restore_default_state(ce, engine);
> +	__execlists_update_reg_state(ce, engine);
> +
> +	/* We've switched away, so this should be a no-op, but intent matters */
> +	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> +}
> +
>   static inline struct intel_engine_cs *
>   __execlists_schedule_in(struct i915_request *rq)
>   {
> @@ -1050,8 +1103,11 @@ __execlists_schedule_in(struct i915_request *rq)
>   
>   	intel_context_get(ce);
>   
> +	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
> +		reset_active(rq, engine);
> +
>   	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> -		execlists_check_context(ce, rq->engine);
> +		execlists_check_context(ce, engine);
>   
>   	if (ce->tag) {
>   		/* Use a fixed tag for OA and friends */
> @@ -1102,59 +1158,6 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
>   		tasklet_schedule(&ve->base.execlists.tasklet);
>   }
>   
> -static void restore_default_state(struct intel_context *ce,
> -				  struct intel_engine_cs *engine)
> -{
> -	u32 *regs = ce->lrc_reg_state;
> -
> -	if (engine->pinned_default_state)
> -		memcpy(regs, /* skip restoring the vanilla PPHWSP */
> -		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
> -		       engine->context_size - PAGE_SIZE);
> -
> -	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
> -}
> -
> -static void reset_active(struct i915_request *rq,
> -			 struct intel_engine_cs *engine)
> -{
> -	struct intel_context * const ce = rq->hw_context;
> -	u32 head;
> -
> -	/*
> -	 * The executing context has been cancelled. We want to prevent
> -	 * further execution along this context and propagate the error on
> -	 * to anything depending on its results.
> -	 *
> -	 * In __i915_request_submit(), we apply the -EIO and remove the
> -	 * requests' payloads for any banned requests. But first, we must
> -	 * rewind the context back to the start of the incomplete request so
> -	 * that we do not jump back into the middle of the batch.
> -	 *
> -	 * We preserve the breadcrumbs and semaphores of the incomplete
> -	 * requests so that inter-timeline dependencies (i.e other timelines)
> -	 * remain correctly ordered. And we defer to __i915_request_submit()
> -	 * so that all asynchronous waits are correctly handled.
> -	 */
> -	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
> -		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
> -
> -	/* On resubmission of the active request, payload will be scrubbed */
> -	if (i915_request_completed(rq))
> -		head = rq->tail;
> -	else
> -		head = active_request(ce->timeline, rq)->head;
> -	ce->ring->head = intel_ring_wrap(ce->ring, head);
> -	intel_ring_update_space(ce->ring);
> -
> -	/* Scrub the context image to prevent replaying the previous batch */
> -	restore_default_state(ce, engine);
> -	__execlists_update_reg_state(ce, engine);
> -
> -	/* We've switched away, so this should be a no-op, but intent matters */
> -	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> -}
> -
>   static inline void
>   __execlists_schedule_out(struct i915_request *rq,
>   			 struct intel_engine_cs * const engine)
> @@ -1165,9 +1168,6 @@ __execlists_schedule_out(struct i915_request *rq,
>   	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
>   	intel_gt_pm_put(engine->gt);
>   
> -	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
> -		reset_active(rq, engine);
> -
>   	/*
>   	 * If this is part of a virtual engine, its next request may
>   	 * have been blocked waiting for access to the active context.
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx