Re: [Intel-gfx] [PATCH] drm/i915/guc: Force a reset on internal GuC error

Intel-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: "Ceraolo Spurio, Daniele" <daniele.ceraolospurio@intel.com>
To: <John.C.Harrison@Intel.com>, <Intel-GFX@Lists.FreeDesktop.Org>
Cc: DRI-Devel@Lists.FreeDesktop.Org
Subject: Re: [Intel-gfx] [PATCH] drm/i915/guc: Force a reset on internal GuC error
Date: Tue, 6 Jun 2023 17:15:59 -0700	[thread overview]
Message-ID: <ea89f474-163d-3838-a364-a2479e183768@intel.com> (raw)
In-Reply-To: <20230605205431.852088-1-John.C.Harrison@Intel.com>



On 6/5/2023 1:54 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> If GuC hits an internal error (and survives long enough to report it
> to the KMD), it is basically toast and will stop until a GT reset and
> subsequent GuC reload is performed. Previously, the KMD just printed
> an error message and then waited for the heartbeat to eventually kick
> in and trigger a reset (assuming the heartbeat had not been disabled).
> Instead, force the reset immediately to guarantee that it happens and
> to eliminate the very long heartbeat delay. The captured error state
> is also more likely to be useful if captured at the time of the error
> rather than many seconds later.
>
> Note that it is not possible to trigger a reset from with the G2H
> handler itself. The reset prepare process involves flushing
> outstanding G2H contents. So a deadlock could result. Instead, the G2H
> handler queues a worker thread to do the reset asynchronously.
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc.c    | 26 +++++++++++++++++++++++
>   drivers/gpu/drm/i915/gt/uc/intel_guc.h    |  9 ++++++++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  6 +-----
>   3 files changed, 36 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> index 2eb891b270aec..c35cf10f52b56 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> @@ -159,6 +159,13 @@ static void gen11_disable_guc_interrupts(struct intel_guc *guc)
>   	gen11_reset_guc_interrupts(guc);
>   }
>   
> +static void guc_dead_worker_func(struct work_struct *w)
> +{
> +	struct intel_guc *guc = container_of(w, struct intel_guc, dead_guc_worker);
> +
> +	intel_gt_handle_error(guc_to_gt(guc), ALL_ENGINES, I915_ERROR_CAPTURE, "dead GuC");
> +}
> +
>   void intel_guc_init_early(struct intel_guc *guc)
>   {
>   	struct intel_gt *gt = guc_to_gt(guc);
> @@ -171,6 +178,8 @@ void intel_guc_init_early(struct intel_guc *guc)
>   	intel_guc_slpc_init_early(&guc->slpc);
>   	intel_guc_rc_init_early(guc);
>   
> +	INIT_WORK(&guc->dead_guc_worker, guc_dead_worker_func);
> +
>   	mutex_init(&guc->send_mutex);
>   	spin_lock_init(&guc->irq_lock);
>   	if (GRAPHICS_VER(i915) >= 11) {
> @@ -585,6 +594,20 @@ int intel_guc_send_mmio(struct intel_guc *guc, const u32 *request, u32 len,
>   	return ret;
>   }
>   
> +int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action)
> +{
> +	if (action == INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED)
> +		guc_err(guc, "Crash dump notification\n");
> +	else if (action == INTEL_GUC_ACTION_NOTIFY_EXCEPTION)
> +		guc_err(guc, "Exception notification\n");
> +	else
> +		guc_err(guc, "Unknown crash notification\n");
> +
> +	queue_work(system_unbound_wq, &guc->dead_guc_worker);

Do we need to flush the worker on suspend/unload?

> +
> +	return 0;
> +}
> +
>   int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
>   				       const u32 *payload, u32 len)
>   {
> @@ -601,6 +624,9 @@ int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
>   	if (msg & INTEL_GUC_RECV_MSG_EXCEPTION)
>   		guc_err(guc, "Received early exception notification!\n");
>   
> +	if (msg & (INTEL_GUC_RECV_MSG_CRASH_DUMP_POSTED | INTEL_GUC_RECV_MSG_EXCEPTION))
> +		queue_work(system_unbound_wq, &guc->dead_guc_worker);

I'm a bit worried about queuing this for a failure during the init flow. 
If we have a systemic issue (e.g. bad memory) we could end up trying and 
failing to reset & reload multiple times, until the wedge code manages 
to set the wedge on init flag.

Daniele

> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index 8dc291ff00935..0b54eec95fc00 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -266,6 +266,14 @@ struct intel_guc {
>   		unsigned long last_stat_jiffies;
>   	} timestamp;
>   
> +	/**
> +	 * @dead_guc_worker: Asynchronous worker thread for forcing a GuC reset.
> +	 * Specifically used when the G2H handler wants to issue a reset. Resets
> +	 * require flushing the G2H queue. So, the G2H processing itself must not
> +	 * trigger a reset directly. Instead, go via this worker.
> +	 */
> +	struct work_struct dead_guc_worker;
> +
>   #ifdef CONFIG_DRM_I915_SELFTEST
>   	/**
>   	 * @number_guc_id_stolen: The number of guc_ids that have been stolen
> @@ -476,6 +484,7 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
>   					 const u32 *msg, u32 len);
>   int intel_guc_error_capture_process_msg(struct intel_guc *guc,
>   					const u32 *msg, u32 len);
> +int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action);
>   
>   struct intel_engine_cs *
>   intel_guc_lookup_engine(struct intel_guc *guc, u8 guc_class, u8 instance);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index f28a3a83742dc..7b09ad6931021 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -1116,12 +1116,8 @@ static int ct_process_request(struct intel_guc_ct *ct, struct ct_incoming_msg *r
>   		ret = 0;
>   		break;
>   	case INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED:
> -		CT_ERROR(ct, "Received GuC crash dump notification!\n");
> -		ret = 0;
> -		break;
>   	case INTEL_GUC_ACTION_NOTIFY_EXCEPTION:
> -		CT_ERROR(ct, "Received GuC exception notification!\n");
> -		ret = 0;
> +		ret = intel_guc_crash_process_msg(guc, action);
>   		break;
>   	default:
>   		ret = -EOPNOTSUPP;

     prev parent reply	other threads:[~2023-06-07  0:16 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-05 20:54 [Intel-gfx] [PATCH] drm/i915/guc: Force a reset on internal GuC error John.C.Harrison
2023-06-06  2:43 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for " Patchwork
2023-06-06  2:54 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2023-06-06 23:48 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
2023-06-07  0:15 ` Ceraolo Spurio, Daniele [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ea89f474-163d-3838-a364-a2479e183768@intel.com \
    --to=daniele.ceraolospurio@intel.com \
    --cc=DRI-Devel@Lists.FreeDesktop.Org \
    --cc=Intel-GFX@Lists.FreeDesktop.Org \
    --cc=John.C.Harrison@Intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox