Intel-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Subject: Re: [Intel-gfx] [PATCH 02/31] drm/i915: Move context revocation to scheduler
Date: Mon, 8 Feb 2021 11:18:35 +0000	[thread overview]
Message-ID: <fe5a1ffc-4f9d-c69f-14e9-05bf5e9e5f0c@linux.intel.com> (raw)
In-Reply-To: <20210208105236.28498-2-chris@chris-wilson.co.uk>


On 08/02/2021 10:52, Chris Wilson wrote:
> Centralise the means by which to remove a context from execution to the
> scheduler, allowing the backends to specialise as necessary. Note that
> without backend support, we can simplify the procedure to forcibly reset
> the HW to remove the context.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c   | 117 +-----------------
>   .../drm/i915/gt/intel_execlists_submission.c  |  47 +++++++
>   drivers/gpu/drm/i915/i915_scheduler.c         |  20 +++
>   drivers/gpu/drm/i915/i915_scheduler_types.h   |   5 +
>   4 files changed, 75 insertions(+), 114 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index ca37d93ef5e7..be75f861db67 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -382,104 +382,9 @@ __context_engines_static(const struct i915_gem_context *ctx)
>   	return rcu_dereference_protected(ctx->engines, true);
>   }
>   
> -static void __reset_context(struct i915_gem_context *ctx,
> -			    struct intel_engine_cs *engine)
> -{
> -	intel_gt_handle_error(engine->gt, engine->mask, 0,
> -			      "context closure in %s", ctx->name);
> -}
> -
> -static bool __cancel_engine(struct intel_engine_cs *engine)
> -{
> -	/*
> -	 * Send a "high priority pulse" down the engine to cause the
> -	 * current request to be momentarily preempted. (If it fails to
> -	 * be preempted, it will be reset). As we have marked our context
> -	 * as banned, any incomplete request, including any running, will
> -	 * be skipped following the preemption.
> -	 *
> -	 * If there is no hangchecking (one of the reasons why we try to
> -	 * cancel the context) and no forced preemption, there may be no
> -	 * means by which we reset the GPU and evict the persistent hog.
> -	 * Ergo if we are unable to inject a preemptive pulse that can
> -	 * kill the banned context, we fallback to doing a local reset
> -	 * instead.
> -	 */
> -	return intel_engine_pulse(engine) == 0;
> -}
> -
> -static bool
> -__active_engine(struct i915_request *rq, struct intel_engine_cs **active)
> -{
> -	struct intel_engine_cs *engine, *locked;
> -	bool ret = false;
> -
> -	/*
> -	 * Serialise with __i915_request_submit() so that it sees
> -	 * is-banned?, or we know the request is already inflight.
> -	 *
> -	 * Note that rq->engine is unstable, and so we double
> -	 * check that we have acquired the lock on the final engine.
> -	 */
> -	locked = READ_ONCE(rq->engine);
> -	spin_lock_irq(&locked->sched.lock);
> -	while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
> -		spin_unlock(&locked->sched.lock);
> -		locked = engine;
> -		spin_lock(&locked->sched.lock);
> -	}
> -
> -	if (i915_request_is_active(rq)) {
> -		if (!__i915_request_is_complete(rq))
> -			*active = locked;
> -		ret = true;
> -	}
> -
> -	spin_unlock_irq(&locked->sched.lock);
> -
> -	return ret;
> -}
> -
> -static struct intel_engine_cs *active_engine(struct intel_context *ce)
> -{
> -	struct intel_engine_cs *engine = NULL;
> -	struct i915_request *rq;
> -
> -	if (intel_context_has_inflight(ce))
> -		return intel_context_inflight(ce);
> -
> -	if (!ce->timeline)
> -		return NULL;
> -
> -	/*
> -	 * rq->link is only SLAB_TYPESAFE_BY_RCU, we need to hold a reference
> -	 * to the request to prevent it being transferred to a new timeline
> -	 * (and onto a new timeline->requests list).
> -	 */
> -	rcu_read_lock();
> -	list_for_each_entry_reverse(rq, &ce->timeline->requests, link) {
> -		bool found;
> -
> -		/* timeline is already completed upto this point? */
> -		if (!i915_request_get_rcu(rq))
> -			break;
> -
> -		/* Check with the backend if the request is inflight */
> -		found = true;
> -		if (likely(rcu_access_pointer(rq->timeline) == ce->timeline))
> -			found = __active_engine(rq, &engine);
> -
> -		i915_request_put(rq);
> -		if (found)
> -			break;
> -	}
> -	rcu_read_unlock();
> -
> -	return engine;
> -}
> -
>   static void kill_engines(struct i915_gem_engines *engines, bool ban)
>   {
> +	const int error = ban ? -EIO : -EAGAIN;
>   	struct i915_gem_engines_iter it;
>   	struct intel_context *ce;
>   
> @@ -491,28 +396,12 @@ static void kill_engines(struct i915_gem_engines *engines, bool ban)
>   	 * engines on which there are incomplete requests.
>   	 */
>   	for_each_gem_engine(ce, engines, it) {
> -		struct intel_engine_cs *engine;
> +		struct i915_sched *se = intel_engine_get_scheduler(ce->engine);
>   
>   		if (ban && intel_context_set_banned(ce))
>   			continue;
>   
> -		/*
> -		 * Check the current active state of this context; if we
> -		 * are currently executing on the GPU we need to evict
> -		 * ourselves. On the other hand, if we haven't yet been
> -		 * submitted to the GPU or if everything is complete,
> -		 * we have nothing to do.
> -		 */
> -		engine = active_engine(ce);
> -
> -		/* First attempt to gracefully cancel the context */
> -		if (engine && !__cancel_engine(engine) && ban)
> -			/*
> -			 * If we are unable to send a preemptive pulse to bump
> -			 * the context from the GPU, we have to resort to a full
> -			 * reset. We hope the collateral damage is worth it.
> -			 */
> -			__reset_context(engines->ctx, engine);
> +		se->revoke_context(ce, ban ? engines->ctx->name : NULL, error);
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 85ff5fe861b4..e51112302fb8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -114,6 +114,7 @@
>   #include "gen8_engine_cs.h"
>   #include "intel_breadcrumbs.h"
>   #include "intel_context.h"
> +#include "intel_engine_heartbeat.h"
>   #include "intel_engine_pm.h"
>   #include "intel_engine_stats.h"
>   #include "intel_execlists_submission.h"
> @@ -2774,6 +2775,50 @@ execlists_active_request(const struct i915_sched *se)
>   	return rq;
>   }
>   
> +static bool __cancel_engine(struct intel_engine_cs *engine)
> +{
> +	/*
> +	 * Send a "high priority pulse" down the engine to cause the
> +	 * current request to be momentarily preempted. (If it fails to
> +	 * be preempted, it will be reset). As we have marked our context
> +	 * as banned, any incomplete request, including any running, will
> +	 * be skipped following the preemption.
> +	 *
> +	 * If there is no hangchecking (one of the reasons why we try to
> +	 * cancel the context) and no forced preemption, there may be no
> +	 * means by which we reset the GPU and evict the persistent hog.
> +	 * Ergo if we are unable to inject a preemptive pulse that can
> +	 * kill the banned context, we fallback to doing a local reset
> +	 * instead.
> +	 */
> +	return intel_engine_pulse(engine) == 0;
> +}
> +
> +static void
> +execlists_revoke_context(struct intel_context *ce, const char *force, int error)
> +{
> +	struct intel_engine_cs *engine;
> +
> +	/*
> +	 * Check the current active state of this context; if we
> +	 * are currently executing on the GPU we need to evict
> +	 * ourselves. On the other hand, if we haven't yet been
> +	 * submitted to the GPU or if everything is complete,
> +	 * we have nothing to do.
> +	 */
> +	engine = intel_context_inflight(ce);
> +
> +	/* First attempt to gracefully cancel the context */
> +	if (engine && !__cancel_engine(engine) && force)
> +		/*
> +		 * If we are unable to send a preemptive pulse to bump
> +		 * the context from the GPU, we have to resort to a full
> +		 * reset. We hope the collateral damage is worth it.
> +		 */
> +		intel_gt_handle_error(engine->gt, engine->mask, 0,
> +				      "context revoked from %s", force);
> +}
> +
>   static bool can_preempt(struct intel_engine_cs *engine)
>   {
>   	if (INTEL_GEN(engine->i915) > 8)
> @@ -2911,6 +2956,7 @@ static void init_execlists(struct intel_engine_cs *engine)
>   	u32 base = engine->mmio_base;
>   
>   	engine->sched.active_request = execlists_active_request;
> +	engine->sched.revoke_context = execlists_revoke_context;
>   	engine->sched.show = execlists_show;
>   	tasklet_setup(&engine->sched.tasklet, execlists_submission_tasklet);
>   
> @@ -3454,6 +3500,7 @@ intel_execlists_create_virtual(struct intel_engine_cs **siblings,
>   			ENGINE_VIRTUAL);
>   
>   	ve->base.sched.submit_request = virtual_submit_request;
> +	ve->base.sched.revoke_context = execlists_revoke_context;
>   	tasklet_setup(&ve->base.sched.tasklet, virtual_submission_tasklet);
>   
>   	virtual_engine_initial_hint(ve);
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index a8fb787278e6..7855601a4958 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -135,6 +135,25 @@ i915_sched_default_active_request(const struct i915_sched *se)
>   	return active;
>   }
>   
> +static bool context_active(struct intel_context *ce)
> +{
> +	return i915_active_fence_isset(&ce->timeline->last_request);
> +}
> +
> +static void
> +i915_sched_default_revoke_context(struct intel_context *ce,
> +				  const char *force,
> +				  int error)
> +{
> +	/*
> +	 * Without backend support, we cannot remove the context from the
> +	 * HW gracefully. All we can do is force a reset, as a last resort.
> +	 */
> +	if (force && context_active(ce))
> +		intel_gt_handle_error(ce->engine->gt, ce->engine->mask, 0,
> +				      "context revoked from %s", force);
> +}
> +
>   void i915_sched_init(struct i915_sched *se,
>   		     struct device *dev,
>   		     const char *name,
> @@ -158,6 +177,7 @@ void i915_sched_init(struct i915_sched *se,
>   
>   	se->submit_request = i915_request_enqueue;
>   	se->active_request = i915_sched_default_active_request;
> +	se->revoke_context = i915_sched_default_revoke_context;
>   }
>   
>   void i915_sched_park(struct i915_sched *se)
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index a8502c94d7c5..84232a07163f 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -15,6 +15,7 @@
>   
>   struct drm_printer;
>   struct i915_request;
> +struct intel_context;
>   
>   /**
>    * struct i915_sched - funnels requests towards hardware
> @@ -40,6 +41,10 @@ struct i915_sched {
>   
>   	struct i915_request *(*active_request)(const struct i915_sched *se);
>   
> +	void (*revoke_context)(struct intel_context *ce,
> +			       const char *whom,
> +			       int error);
> +
>   	void (*show)(struct drm_printer *m,
>   		     struct i915_sched *se,
>   		     void (*show_request)(struct drm_printer *m,
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  reply	other threads:[~2021-02-08 11:18 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-02-08 10:52 [Intel-gfx] [PATCH 01/31] drm/i915/gt: Ratelimit heartbeat completion probing Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 02/31] drm/i915: Move context revocation to scheduler Chris Wilson
2021-02-08 11:18   ` Tvrtko Ursulin [this message]
2021-02-08 10:52 ` [Intel-gfx] [PATCH 03/31] drm/i915: Introduce the scheduling mode Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 04/31] drm/i915: Move timeslicing flag to scheduler Chris Wilson
2021-02-08 11:43   ` Tvrtko Ursulin
2021-02-08 10:52 ` [Intel-gfx] [PATCH 05/31] drm/i915/gt: Declare when we enabled timeslicing Chris Wilson
2021-02-08 11:44   ` Tvrtko Ursulin
2021-02-08 10:52 ` [Intel-gfx] [PATCH 06/31] drm/i915: Move busywaiting control to the scheduler Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 07/31] drm/i915: Move preempt-reset flag " Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 08/31] drm/i915: Fix the iterative dfs for defering requests Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 09/31] drm/i915: Replace priolist rbtree with a skiplist Chris Wilson
2021-02-08 12:29   ` Tvrtko Ursulin
2021-02-08 12:46     ` Chris Wilson
2021-02-08 15:10       ` Tvrtko Ursulin
2021-02-08 15:23   ` Tvrtko Ursulin
2021-02-08 16:19     ` Chris Wilson
2021-02-09 16:11       ` Tvrtko Ursulin
2021-02-08 10:52 ` [Intel-gfx] [PATCH 10/31] drm/i915: Fair low-latency scheduling Chris Wilson
2021-02-08 14:56   ` Tvrtko Ursulin
2021-02-08 15:29     ` Chris Wilson
2021-02-08 16:03       ` Tvrtko Ursulin
2021-02-08 16:11         ` Chris Wilson
2021-02-09  9:37   ` Tvrtko Ursulin
2021-02-09 10:31     ` Chris Wilson
2021-02-09 10:40       ` Tvrtko Ursulin
2021-02-08 10:52 ` [Intel-gfx] [PATCH 11/31] drm/i915/gt: Specify a deadline for the heartbeat Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 12/31] drm/i915: Extend the priority boosting for the display with a deadline Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 13/31] drm/i915/gt: Support virtual engine queues Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 14/31] drm/i915: Move saturated workload detection back to the context Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 15/31] drm/i915: Bump default timeslicing quantum to 5ms Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 16/31] drm/i915/gt: Delay taking irqoff for execlists submission Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 17/31] drm/i915/gt: Convert the legacy ring submission to use the scheduling interface Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 18/31] drm/i915/gt: Wrap intel_timeline.has_initial_breadcrumb Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 19/31] drm/i915/gt: Track timeline GGTT offset separately from subpage offset Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 20/31] drm/i915/gt: Add timeline "mode" Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 21/31] drm/i915/gt: Use indices for writing into relative timelines Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 22/31] drm/i915/selftests: Exercise relative timeline modes Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 23/31] drm/i915/gt: Use ppHWSP for unshared non-semaphore related timelines Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 24/31] Restore "drm/i915: drop engine_pin/unpin_breadcrumbs_irq" Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 25/31] drm/i915/gt: Support creation of 'internal' rings Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 26/31] drm/i915/gt: Use client timeline address for seqno writes Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 27/31] drm/i915/gt: Infrastructure for ring scheduling Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 28/31] drm/i915/gt: Implement ring scheduler for gen4-7 Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 29/31] drm/i915/gt: Enable ring scheduling for gen5-7 Chris Wilson
2021-02-08 10:52 ` [Intel-gfx] [PATCH 30/31] drm/i915: Support secure dispatch on gen6/gen7 Chris Wilson
2021-02-08 20:55   ` Dave Airlie
2021-02-08 22:49     ` Chris Wilson
2021-02-09 11:02     ` Tvrtko Ursulin
2021-02-08 10:52 ` [Intel-gfx] [PATCH 31/31] drm/i915/gt: Limit C-states while waiting for requests Chris Wilson
2021-02-08 15:43 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [01/31] drm/i915/gt: Ratelimit heartbeat completion probing Patchwork
2021-02-08 15:45 ` [Intel-gfx] ✗ Fi.CI.SPARSE: " Patchwork
2021-02-08 16:13 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
2021-02-09 17:52 ` [Intel-gfx] [PATCH 01/31] " Mika Kuoppala

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=fe5a1ffc-4f9d-c69f-14e9-05bf5e9e5f0c@linux.intel.com \
    --to=tvrtko.ursulin@linux.intel.com \
    --cc=chris@chris-wilson.co.uk \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox