Re: [Intel-gfx] [PATCH v2 2/5] drm/i915: Allow error capture without a request

Intel-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: John.C.Harrison@Intel.com, Intel-GFX@Lists.FreeDesktop.Org
Cc: DRI-Devel@Lists.FreeDesktop.Org
Subject: Re: [Intel-gfx] [PATCH v2 2/5] drm/i915: Allow error capture without a request
Date: Wed, 18 Jan 2023 16:34:25 +0000	[thread overview]
Message-ID: <548703fc-ebb4-b991-6b64-a4a378feb507@linux.intel.com> (raw)
In-Reply-To: <20230117213630.2897570-3-John.C.Harrison@Intel.com>


On 17/01/2023 21:36, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> There was a report of error captures occurring without any hung
> context being indicated despite the capture being initiated by a 'hung
> context notification' from GuC. The problem was not reproducible.
> However, it is possible to happen if the context in question has no
> active requests. For example, if the hang was in the context switch
> itself then the breadcrumb write would have occurred and the KMD would
> see an idle context.
> 
> In the interests of attempting to provide as much information as
> possible about a hang, it seems wise to include the engine info
> regardless of whether a request was found or not. As opposed to just
> prentending there was no hang at all.
> 
> So update the error capture code to always record engine information
> if an engine is given. Which means updating record_context() to take a
> context instead of a request (which it only ever used to find the
> context anyway). And split the request agnostic parts of
> intel_engine_coredump_add_request() out into a seaprate function.
> 
> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
> pointer.
> v3: Tidy up request locking code flow (Tvrtko)
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gpu_error.c | 69 ++++++++++++++++++---------
>   1 file changed, 46 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 4107a0dfcca7d..461489d599a7e 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee)
>   }
>   
>   static bool record_context(struct i915_gem_context_coredump *e,
> -			   const struct i915_request *rq)
> +			   struct intel_context *ce)
>   {
>   	struct i915_gem_context *ctx;
>   	struct task_struct *task;
>   	bool simulated;
>   
>   	rcu_read_lock();
> -	ctx = rcu_dereference(rq->context->gem_context);
> +	ctx = rcu_dereference(ce->gem_context);
>   	if (ctx && !kref_get_unless_zero(&ctx->ref))
>   		ctx = NULL;
>   	rcu_read_unlock();
> @@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e,
>   	e->guilty = atomic_read(&ctx->guilty_count);
>   	e->active = atomic_read(&ctx->active_count);
>   
> -	e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
> -	e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
> +	e->total_runtime = intel_context_get_total_runtime_ns(ce);
> +	e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>   
>   	simulated = i915_gem_context_no_error_capture(ctx);
>   
> @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_
>   	return ee;
>   }
>   
> +static struct intel_engine_capture_vma *
> +engine_coredump_add_context(struct intel_engine_coredump *ee,
> +			    struct intel_context *ce,
> +			    gfp_t gfp)
> +{
> +	struct intel_engine_capture_vma *vma = NULL;
> +
> +	ee->simulated |= record_context(&ee->context, ce);
> +	if (ee->simulated)
> +		return NULL;
> +
> +	/*
> +	 * We need to copy these to an anonymous buffer
> +	 * as the simplest method to avoid being overwritten
> +	 * by userspace.
> +	 */
> +	vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
> +	vma = capture_vma(vma, ce->state, "HW context", gfp);
> +
> +	return vma;
> +}
> +
>   struct intel_engine_capture_vma *
>   intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>   				  struct i915_request *rq,
>   				  gfp_t gfp)
>   {
> -	struct intel_engine_capture_vma *vma = NULL;
> +	struct intel_engine_capture_vma *vma;
>   
> -	ee->simulated |= record_context(&ee->context, rq);
> -	if (ee->simulated)
> +	vma = engine_coredump_add_context(ee, rq->context, gfp);
> +	if (!vma)
>   		return NULL;
>   
>   	/*
> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>   	 */
>   	vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
>   	vma = capture_user(vma, rq, gfp);
> -	vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
> -	vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>   
>   	ee->rq_head = rq->head;
>   	ee->rq_post = rq->postfix;
> @@ -1609,8 +1629,12 @@ capture_engine(struct intel_engine_cs *engine,
>   		intel_engine_clear_hung_context(engine);
>   		/* This will reference count the request (if found) */
>   		rq = intel_context_find_active_request(ce);
> -		if (!rq || !i915_request_started(rq))
> -			goto no_request_capture;
> +		if (rq && !i915_request_started(rq)) {
> +			drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n",
> +				 engine->name);
> +			i915_request_put(rq);
> +			rq = NULL;
> +		}
>   	} else {
>   		/*
>   		 * Getting here with GuC enabled means it is a forced error capture
> @@ -1624,25 +1648,24 @@ capture_engine(struct intel_engine_cs *engine,
>   					       flags);
>   		}
>   	}
> -	if (!rq)
> -		goto no_request_capture;
> -
> -	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
> -	if (!capture) {
> +	if (rq) {
> +		capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
>   		i915_request_put(rq);
> -		goto no_request_capture;
> +	} else if (ce) {
> +		capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
>   	}
> +
>   	if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>   		intel_guc_capture_get_matching_node(engine->gt, ee, ce);
>   
> -	intel_engine_coredump_add_vma(ee, capture, compress);
> -	i915_request_put(rq);
> +	if (capture) {
> +		intel_engine_coredump_add_vma(ee, capture, compress);
> +	} else {
> +		kfree(ee);
> +		ee = NULL;
> +	}
>   
>   	return ee;
> -
> -no_request_capture:
> -	kfree(ee);
> -	return NULL;
>   }
>   
>   static void

LGTM - regardless of how i915_request_get_rcu flow ends up:

Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

next prev parent reply	other threads:[~2023-01-18 16:35 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-17 21:36 [Intel-gfx] [PATCH v2 0/5] Allow error capture without a request / on reset failure John.C.Harrison
2023-01-17 21:36 ` [Intel-gfx] [PATCH v2 1/5] drm/i915: Fix request locking during error capture & debugfs dump John.C.Harrison
2023-01-18  8:29   ` Andy Shevchenko
2023-01-18 17:34     ` John Harrison
2023-01-18 17:54       ` Andy Shevchenko
2023-01-18 18:18         ` John Harrison
2023-01-18 16:22   ` Tvrtko Ursulin
2023-01-18 17:55     ` John Harrison
2023-01-17 21:36 ` [Intel-gfx] [PATCH v2 2/5] drm/i915: Allow error capture without a request John.C.Harrison
2023-01-18 16:34   ` Tvrtko Ursulin [this message]
2023-01-17 21:36 ` [Intel-gfx] [PATCH v2 3/5] drm/i915: Allow error capture of a pending request John.C.Harrison
2023-01-18 16:35   ` Tvrtko Ursulin
2023-01-17 21:36 ` [Intel-gfx] [PATCH v2 4/5] drm/i915/guc: Look for a guilty context when an engine reset fails John.C.Harrison
2023-01-18 16:37   ` Tvrtko Ursulin
2023-01-17 21:36 ` [Intel-gfx] [PATCH v2 5/5] drm/i915/guc: Add a debug print on GuC triggered reset John.C.Harrison
2023-01-17 22:55 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for Allow error capture without a request / on reset failure (rev3) Patchwork
2023-01-17 22:55 ` [Intel-gfx] ✗ Fi.CI.SPARSE: " Patchwork
2023-01-17 23:05 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=548703fc-ebb4-b991-6b64-a4a378feb507@linux.intel.com \
    --to=tvrtko.ursulin@linux.intel.com \
    --cc=DRI-Devel@Lists.FreeDesktop.Org \
    --cc=Intel-GFX@Lists.FreeDesktop.Org \
    --cc=John.C.Harrison@Intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox