From: "Ceraolo Spurio, Daniele" <daniele.ceraolospurio@intel.com>
To: <John.C.Harrison@Intel.com>, <Intel-GFX@Lists.FreeDesktop.Org>
Cc: DRI-Devel@Lists.FreeDesktop.Org
Subject: Re: [Intel-gfx] [PATCH v4 2/7] drm/i915: Fix up locking around dumping requests lists
Date: Tue, 24 Jan 2023 16:31:21 -0800 [thread overview]
Message-ID: <edd0d977-fe21-0004-0a1b-61ed97d42147@intel.com> (raw)
In-Reply-To: <20230120232831.28177-3-John.C.Harrison@Intel.com>
On 1/20/2023 3:28 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> The debugfs dump of requests was confused about what state requires
> the execlist lock versus the GuC lock. There was also a bunch of
> duplicated messy code between it and the error capture code.
>
> So refactor the hung request search into a re-usable function. And
> reduce the span of the execlist state lock to only the execlist
> specific code paths. In order to do that, also move the report of hold
> count (which is an execlist only concept) from the top level dump
> function to the lower level execlist specific function. Also, move the
> execlist specific code into the execlist source file.
>
> v2: Rename some functions and move to more appropriate files (Daniele).
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_engine.h | 4 +-
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 74 +++++++++----------
> .../drm/i915/gt/intel_execlists_submission.c | 27 +++++++
> .../drm/i915/gt/intel_execlists_submission.h | 4 +
> drivers/gpu/drm/i915/i915_gpu_error.c | 26 +------
> 5 files changed, 73 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> index 0e24af5efee9c..b58c30ac8ef02 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> @@ -250,8 +250,8 @@ void intel_engine_dump_active_requests(struct list_head *requests,
> ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine,
> ktime_t *now);
>
> -struct i915_request *
> -intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine);
> +void intel_engine_get_hung_entity(struct intel_engine_cs *engine,
> + struct intel_context **ce, struct i915_request **rq);
>
> u32 intel_engine_context_size(struct intel_gt *gt, u8 class);
> struct intel_context *
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index fbc0a81617e89..1d77e27801bce 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -2114,17 +2114,6 @@ static void print_request_ring(struct drm_printer *m, struct i915_request *rq)
> }
> }
>
> -static unsigned long list_count(struct list_head *list)
> -{
> - struct list_head *pos;
> - unsigned long count = 0;
> -
> - list_for_each(pos, list)
> - count++;
> -
> - return count;
> -}
> -
> static unsigned long read_ul(void *p, size_t x)
> {
> return *(unsigned long *)(p + x);
> @@ -2216,11 +2205,11 @@ void intel_engine_dump_active_requests(struct list_head *requests,
> }
> }
>
> -static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m)
> +static void engine_dump_active_requests(struct intel_engine_cs *engine,
> + struct drm_printer *m)
> {
> + struct intel_context *hung_ce = NULL;
> struct i915_request *hung_rq = NULL;
> - struct intel_context *ce;
> - bool guc;
>
> /*
> * No need for an engine->irq_seqno_barrier() before the seqno reads.
> @@ -2229,29 +2218,20 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d
> * But the intention here is just to report an instantaneous snapshot
> * so that's fine.
> */
> - lockdep_assert_held(&engine->sched_engine->lock);
> + intel_engine_get_hung_entity(engine, &hung_ce, &hung_rq);
>
> drm_printf(m, "\tRequests:\n");
>
> - guc = intel_uc_uses_guc_submission(&engine->gt->uc);
> - if (guc) {
> - ce = intel_engine_get_hung_context(engine);
> - if (ce)
> - hung_rq = intel_context_find_active_request_get(ce);
> - } else {
> - hung_rq = intel_engine_execlist_find_hung_request(engine);
> - if (hung_rq)
> - hung_rq = i915_request_get_rcu(hung_rq);
> - }
> -
> if (hung_rq)
> engine_dump_request(hung_rq, m, "\t\thung");
> + else if (hung_ce)
> + drm_printf(m, "\t\tGot hung ce but no hung rq!\n");
>
> - if (guc)
> + if (intel_uc_uses_guc_submission(&engine->gt->uc))
> intel_guc_dump_active_requests(engine, hung_rq, m);
> else
> - intel_engine_dump_active_requests(&engine->sched_engine->requests,
> - hung_rq, m);
> + intel_execlist_dump_active_requests(engine, hung_rq, m);
> +
> if (hung_rq)
> i915_request_put(hung_rq);
> }
> @@ -2263,7 +2243,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> struct i915_gpu_error * const error = &engine->i915->gpu_error;
> struct i915_request *rq;
> intel_wakeref_t wakeref;
> - unsigned long flags;
> ktime_t dummy;
>
> if (header) {
> @@ -2300,13 +2279,8 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> i915_reset_count(error));
> print_properties(engine, m);
>
> - spin_lock_irqsave(&engine->sched_engine->lock, flags);
> engine_dump_active_requests(engine, m);
>
> - drm_printf(m, "\tOn hold?: %lu\n",
> - list_count(&engine->sched_engine->hold));
> - spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> -
> drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base);
> wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm);
> if (wakeref) {
> @@ -2352,8 +2326,7 @@ intel_engine_create_virtual(struct intel_engine_cs **siblings,
> return siblings[0]->cops->create_virtual(siblings, count, flags);
> }
>
> -struct i915_request *
> -intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
> +static struct i915_request *engine_execlist_find_hung_request(struct intel_engine_cs *engine)
> {
> struct i915_request *request, *active = NULL;
>
> @@ -2405,6 +2378,33 @@ intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
> return active;
> }
>
> +void intel_engine_get_hung_entity(struct intel_engine_cs *engine,
> + struct intel_context **ce, struct i915_request **rq)
> +{
> + unsigned long flags;
> +
> + *ce = intel_engine_get_hung_context(engine);
> + if (*ce) {
> + intel_engine_clear_hung_context(engine);
> +
> + *rq = intel_context_find_active_request_get(*ce);
> + return;
> + }
> +
> + /*
> + * Getting here with GuC enabled means it is a forced error capture
> + * with no actual hang. So, no need to attempt the execlist search.
> + */
> + if (intel_uc_uses_guc_submission(&engine->gt->uc))
> + return;
> +
> + spin_lock_irqsave(&engine->sched_engine->lock, flags);
> + *rq = engine_execlist_find_hung_request(engine);
> + if (*rq)
> + *rq = i915_request_get_rcu(*rq);
> + spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> +}
> +
> void xehp_enable_ccs_engines(struct intel_engine_cs *engine)
> {
> /*
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 18ffe55282e59..05995c8577bef 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -4150,6 +4150,33 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
> spin_unlock_irqrestore(&sched_engine->lock, flags);
> }
>
> +static unsigned long list_count(struct list_head *list)
> +{
> + struct list_head *pos;
> + unsigned long count = 0;
> +
> + list_for_each(pos, list)
> + count++;
> +
> + return count;
> +}
> +
> +void intel_execlist_dump_active_requests(struct intel_engine_cs *engine,
nit: we usually use "execlists" and not "execlist".
Apart from this the patch LGTM.
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Daniele
> + struct i915_request *hung_rq,
> + struct drm_printer *m)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&engine->sched_engine->lock, flags);
> +
> + intel_engine_dump_active_requests(&engine->sched_engine->requests, hung_rq, m);
> +
> + drm_printf(m, "\tOn hold?: %lu\n",
> + list_count(&engine->sched_engine->hold));
> +
> + spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> +}
> +
> #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> #include "selftest_execlists.c"
> #endif
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> index a1aa92c983a51..cb07488a03764 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> @@ -32,6 +32,10 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
> int indent),
> unsigned int max);
>
> +void intel_execlist_dump_active_requests(struct intel_engine_cs *engine,
> + struct i915_request *hung_rq,
> + struct drm_printer *m);
> +
> bool
> intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine);
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 5c73dfa2fb3f6..b20bd6365615b 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1596,35 +1596,15 @@ capture_engine(struct intel_engine_cs *engine,
> {
> struct intel_engine_capture_vma *capture = NULL;
> struct intel_engine_coredump *ee;
> - struct intel_context *ce;
> + struct intel_context *ce = NULL;
> struct i915_request *rq = NULL;
> - unsigned long flags;
>
> ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags);
> if (!ee)
> return NULL;
>
> - ce = intel_engine_get_hung_context(engine);
> - if (ce) {
> - intel_engine_clear_hung_context(engine);
> - rq = intel_context_find_active_request_get(ce);
> - if (!rq || !i915_request_started(rq))
> - goto no_request_capture;
> - } else {
> - /*
> - * Getting here with GuC enabled means it is a forced error capture
> - * with no actual hang. So, no need to attempt the execlist search.
> - */
> - if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
> - spin_lock_irqsave(&engine->sched_engine->lock, flags);
> - rq = intel_engine_execlist_find_hung_request(engine);
> - if (rq)
> - rq = i915_request_get_rcu(rq);
> - spin_unlock_irqrestore(&engine->sched_engine->lock,
> - flags);
> - }
> - }
> - if (!rq)
> + intel_engine_get_hung_entity(engine, &ce, &rq);
> + if (!rq || !i915_request_started(rq))
> goto no_request_capture;
>
> capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
next prev parent reply other threads:[~2023-01-25 0:31 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-01-20 23:28 [Intel-gfx] [PATCH v4 0/7] Allow error capture without a request & fix locking issues John.C.Harrison
2023-01-20 23:28 ` [Intel-gfx] [PATCH v4 1/7] drm/i915: Fix request locking during error capture & debugfs dump John.C.Harrison
2023-01-23 17:51 ` Tvrtko Ursulin
2023-01-23 20:35 ` John Harrison
2023-01-25 22:04 ` John Harrison
2023-01-20 23:28 ` [Intel-gfx] [PATCH v4 2/7] drm/i915: Fix up locking around dumping requests lists John.C.Harrison
2023-01-20 23:40 ` John Harrison
2023-01-24 14:40 ` Tvrtko Ursulin
2023-01-25 18:00 ` John Harrison
2023-01-25 18:12 ` Tvrtko Ursulin
2023-01-25 18:17 ` John Harrison
2023-01-25 0:31 ` Ceraolo Spurio, Daniele [this message]
2023-01-20 23:28 ` [Intel-gfx] [PATCH v4 3/7] drm/i915: Allow error capture without a request John.C.Harrison
2023-01-25 0:39 ` Ceraolo Spurio, Daniele
2023-01-25 0:56 ` John Harrison
2023-01-20 23:28 ` [Intel-gfx] [PATCH v4 4/7] drm/i915: Allow error capture of a pending request John.C.Harrison
2023-01-20 23:28 ` [Intel-gfx] [PATCH v4 5/7] drm/i915/guc: Look for a guilty context when an engine reset fails John.C.Harrison
2023-01-20 23:28 ` [Intel-gfx] [PATCH v4 6/7] drm/i915/guc: Add a debug print on GuC triggered reset John.C.Harrison
2023-01-20 23:28 ` [Intel-gfx] [PATCH v4 7/7] drm/i915/guc: Rename GuC register state capture node to be more obvious John.C.Harrison
2023-01-25 0:44 ` Ceraolo Spurio, Daniele
2023-01-20 23:57 ` [Intel-gfx] ✓ Fi.CI.BAT: success for Allow error capture without a request & fix locking issues (rev2) Patchwork
2023-01-21 21:41 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=edd0d977-fe21-0004-0a1b-61ed97d42147@intel.com \
--to=daniele.ceraolospurio@intel.com \
--cc=DRI-Devel@Lists.FreeDesktop.Org \
--cc=Intel-GFX@Lists.FreeDesktop.Org \
--cc=John.C.Harrison@Intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox