* Fwd: [PATCH v2 1/1] drm/xe/guc: Handle GuC local uncorrectable error notifications
[not found] <ffebb8b2-43c7-408f-86d1-218a0b13cf55@intel.com>
@ 2026-06-25 18:09 ` Dong, Zhanjun
0 siblings, 0 replies; only message in thread
From: Dong, Zhanjun @ 2026-06-25 18:09 UTC (permalink / raw)
To: intel-xe@lists.freedesktop.org
Add back mailinglist
-------- Forwarded Message --------
Subject: Re: [PATCH v2 1/1] drm/xe/guc: Handle GuC local uncorrectable
error notifications
Date: Thu, 25 Jun 2026 11:47:14 -0400
From: Dong, Zhanjun <zhanjun.dong@intel.com>
To: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Thanks for take time to review it. See my comment inline below.
Regards,
Zhanjun Dong
On 2026-06-24 5:41 p.m., Daniele Ceraolo Spurio wrote:
>
>
> On 6/12/2026 3:44 PM, Zhanjun Dong wrote:
>> Add support for the GuC uncorrectable local error G2H notification and
>> opt in to the feature when the submission ABI exposes it.
>>
>> When the notification targets a known exec queue, treat it like an
>> engine reset request and route it through the existing timeout cleanup
>> path. This keeps the queue teardown, pending job cancellation and error
>> capture in one place instead of open-coding a parallel recovery flow.
>>
>> The timeout worker also needs to cope with this externally triggered
>> reset path. Keep permanent or wedged queue destruction asynchronous when
>> the timeout path is already running from a scheduler worker.
>>
>> Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
>> ---
>> History:
>> v2: Opt in for Xe3p only, excluding media GTs on NovaLake-P which
>> don't support the feature
>> Remove timeout bypass, which is outside the scope of this patch
>> and can be added separately
>> ---
>> drivers/gpu/drm/xe/abi/guc_actions_abi.h | 1 +
>> drivers/gpu/drm/xe/abi/guc_klvs_abi.h | 8 ++++
>> drivers/gpu/drm/xe/xe_guc.c | 9 ++++
>> drivers/gpu/drm/xe/xe_guc_ct.c | 3 ++
>> drivers/gpu/drm/xe/xe_guc_submit.c | 56 +++++++++++++++++++++++-
>> drivers/gpu/drm/xe/xe_guc_submit.h | 1 +
>> drivers/gpu/drm/xe/xe_trace.h | 5 +++
>> 7 files changed, 82 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/
>> drm/xe/abi/guc_actions_abi.h
>> index 83a6e7794982..f5c9b37038d4 100644
>> --- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> @@ -152,6 +152,7 @@ enum xe_guc_action {
>> XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC = 0x6002,
>> XE_GUC_ACTION_PAGE_FAULT_RES_DESC = 0x6003,
>> XE_GUC_ACTION_ACCESS_COUNTER_NOTIFY = 0x6004,
>> + XE_GUC_ACTION_NOTIFY_UNCORRECTABLE_LOCAL_ERROR = 0x6005,
>> XE_GUC_ACTION_TLB_INVALIDATION = 0x7000,
>> XE_GUC_ACTION_TLB_INVALIDATION_DONE = 0x7001,
>> XE_GUC_ACTION_TLB_INVALIDATION_ALL = 0x7002,
>> diff --git a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h b/drivers/gpu/drm/
>> xe/abi/guc_klvs_abi.h
>> index 644f5a4226d7..5c428f02a642 100644
>> --- a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
>> +++ b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
>> @@ -154,6 +154,11 @@ enum {
>> * (instead of waiting the full timeslice duration). The bit is
>> instead set
>> * to one if a single context is queued on the engine, to avoid
>> it being
>> * switched out if there isn't another context that can run in
>> its place.
>> + *
>> + * _`GUC_KLV_OPT_IN_FEATURE_UNCORRECTABLE_LOCAL_ERROR_NOTIFICATION` :
>> 0x4004
>> + * This flag will enable notification from GuC to KMD via G2H
>> message
>> + * GUC_ACTION_GUC2HOST_NOTIFY_UNCORRECTABLE_LOCAL_ERROR upon
>> receiving the
>> + * same interrupt from the CS.
>> */
>> #define GUC_KLV_OPT_IN_FEATURE_EXT_CAT_ERR_TYPE_KEY 0x4001
>> @@ -162,6 +167,9 @@ enum {
>> #define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_KEY
>> 0x4003
>> #define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_LEN 0u
>> +#define
>> GUC_KLV_OPT_IN_FEATURE_UNCORRECTABLE_LOCAL_ERROR_NOTIFICATION_KEY 0x4004
>> +#define
>> GUC_KLV_OPT_IN_FEATURE_UNCORRECTABLE_LOCAL_ERROR_NOTIFICATION_LEN 0u
>> +
>> /**
>> * DOC: GuC Scheduling Policies KLVs
>> *
>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>> index 4023700ff2a9..3b3c6c450062 100644
>> --- a/drivers/gpu/drm/xe/xe_guc.c
>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>> @@ -12,6 +12,7 @@
>> #include "abi/guc_actions_abi.h"
>> #include "abi/guc_errors_abi.h"
>> +#include "abi/guc_klvs_abi.h"
>> #include "regs/xe_gt_regs.h"
>> #include "regs/xe_gtt_defs.h"
>> #include "regs/xe_guc_regs.h"
>> @@ -641,6 +642,14 @@ int xe_guc_opt_in_features_enable(struct xe_guc
>> *guc)
>> if (GUC_SUBMIT_VER(guc) >= MAKE_GUC_VER(1, 7, 0))
>> klvs[count++] =
>> PREP_GUC_KLV_TAG(OPT_IN_FEATURE_EXT_CAT_ERR_TYPE);
>> + /*
>> + * The uncorrectable local error notification opt-in was added in
>> + * GuC v70.38.0, which maps to compatibility version v1.18.0.
>> + */
>> + if (GRAPHICS_VER(xe) >= 35 && GUC_SUBMIT_VER(guc) >=
>> MAKE_GUC_VER(1, 18, 0) &&
>> + !(xe->info.platform == XE_NOVALAKE_P &&
>> xe_gt_is_media_type(guc_to_gt(guc))))
>> + klvs[count++] =
>> PREP_GUC_KLV_TAG(OPT_IN_FEATURE_UNCORRECTABLE_LOCAL_ERROR_NOTIFICATION);
>> +
>> if (supports_dynamic_ics(guc))
>> klvs[count++] =
>> PREP_GUC_KLV_TAG(OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH);
>> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/
>> xe_guc_ct.c
>> index 21e0dad9a481..fe6a55fe7fb5 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
>> @@ -1661,6 +1661,9 @@ static int process_g2h_msg(struct xe_guc_ct *ct,
>> u32 *msg, u32 len)
>> ret = xe_guc_exec_queue_memory_cat_error_handler(guc, payload,
>> adj_len);
>> break;
>> + case XE_GUC_ACTION_NOTIFY_UNCORRECTABLE_LOCAL_ERROR:
>> + ret = xe_guc_uncorrectable_error_handler(guc, payload, adj_len);
>> + break;
>> case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
>> ret = xe_guc_pagefault_handler(guc, payload, adj_len);
>> break;
>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/
>> xe_guc_submit.c
>> index b29cc08e6291..2840ab0bcd09 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>> @@ -9,6 +9,7 @@
>> #include <linux/bitmap.h>
>> #include <linux/circ_buf.h>
>> #include <linux/dma-fence-array.h>
>> +#include <linux/workqueue.h>
>> #include <drm/drm_managed.h>
>> @@ -71,6 +72,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
>> #define EXEC_QUEUE_STATE_WEDGED (1 << 8)
>> #define EXEC_QUEUE_STATE_BANNED (1 << 9)
>> #define EXEC_QUEUE_STATE_PENDING_RESUME (1 << 10)
>> +#define EXEC_QUEUE_STATE_UNCORRECTABLE_ERROR (1 << 11)
>
> This can be dropped if no upstream user
Sure, will drop as not referenced
>
>> static bool exec_queue_registered(struct xe_exec_queue *q)
>> {
>> @@ -217,6 +219,11 @@ static void
>> clear_exec_queue_pending_resume(struct xe_exec_queue *q)
>> atomic_and(~EXEC_QUEUE_STATE_PENDING_RESUME, &q->guc->state);
>> }
>> +static void set_exec_queue_uncorrectable_error(struct xe_exec_queue *q)
>> +{
>> + atomic_or(EXEC_QUEUE_STATE_UNCORRECTABLE_ERROR, &q->guc->state);
>> +}
>> +
>> static bool exec_queue_killed_or_banned_or_wedged(struct
>> xe_exec_queue *q)
>> {
>> return (atomic_read(&q->guc->state) &
>> @@ -1697,6 +1704,19 @@ static void
>> __guc_exec_queue_destroy_async(struct work_struct *w)
>> xe_exec_queue_fini(q);
>> }
>> +static bool guc_exec_queue_in_sched_worker(struct xe_exec_queue *q)
>> +{
>> + struct work_struct *work = current_work();
>> +
>> + if (!work)
>> + return false;
>> +
>> + return work == &q->guc->sched.base.work_run_job ||
>> + work == &q->guc->sched.base.work_free_job ||
>> + work == &q->guc->sched.base.work_tdr.work ||
>> + work == &q->guc->sched.work_process_msg;
>> +}
>> +
>
> This can be dropped, replaced by Matt's fix
To be dropped
>
>> static void guc_exec_queue_destroy_async(struct xe_exec_queue *q)
>> {
>> struct xe_guc *guc = exec_queue_to_guc(q);
>> @@ -1705,7 +1725,8 @@ static void guc_exec_queue_destroy_async(struct
>> xe_exec_queue *q)
>> INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async);
>> /* We must block on kernel engines so slabs are empty on driver
>> unload */
>> - if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
>> + if ((q->flags & EXEC_QUEUE_FLAG_PERMANENT ||
>> exec_queue_wedged(q)) &&
>> + !guc_exec_queue_in_sched_worker(q))
>> __guc_exec_queue_destroy_async(&q->guc->destroy_async);
>> else
>> queue_work(xe->destroy_wq, &q->guc->destroy_async);
>> @@ -2997,6 +3018,39 @@ int
>> xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>> return 0;
>> }
>> +int xe_guc_uncorrectable_error_handler(struct xe_guc *guc, u32 *msg,
>> u32 len)
>> +{
>> + struct xe_gt *gt = guc_to_gt(guc);
>> + struct xe_exec_queue *q;
>> + u32 guc_id;
>> +
>> + if (unlikely(!len || len > 2))
>
> Here you're allowing len = 1 and len = 2, but below you only read 1
> dword (msg[0]).
You are right, expected length is 1, to be corrected in next rev.
>
>> + return -EPROTO;
>> +
>> + guc_id = msg[0];
>> +
>> + if (guc_id == GUC_ID_UNKNOWN) {
>> + xe_gt_err(gt, "GuC: Uncorrectable local error! guc_id=%d\n",
>> guc_id);
>> + return 0;
>> + }
>> +
>> + q = g2h_exec_queue_lookup(guc, guc_id);
>> + if (unlikely(!q))
>> + return -EPROTO;
>> +
>> + xe_gt_err(gt,
>> + "GuC: Uncorrectable local error! guc_id=%d class=%s,
>> logical_mask=0x%x",
>> + guc_id, xe_hw_engine_class_to_str(q->class), q->logical_mask);
>> +
>> + trace_xe_guc_uncorrectable_error(q);
>> + set_exec_queue_uncorrectable_error(q);
>> +
>> + if (guc_to_xe(guc)->wedged.mode !=
>> XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET)
>
> This needs to be a separate patch, probably implemented further down the
> reset path.
Yes, to be moved into a separate patch
>
>> + xe_guc_exec_queue_reset_trigger_cleanup(q);
>> +
>> + return 0;
>> +}
>> +
>> int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32
>> *msg, u32 len)
>> {
>> struct xe_gt *gt = guc_to_gt(guc);
>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/
>> xe_guc_submit.h
>> index b3839a90c142..ccade320dc69 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_submit.h
>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.h
>> @@ -34,6 +34,7 @@ int xe_guc_deregister_done_handler(struct xe_guc
>> *guc, u32 *msg, u32 len);
>> int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg,
>> u32 len);
>> int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc,
>> u32 *msg,
>> u32 len);
>> +int xe_guc_uncorrectable_error_handler(struct xe_guc *guc, u32 *msg,
>> u32 len);
>> int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32
>> *msg, u32 len);
>> int xe_guc_error_capture_handler(struct xe_guc *guc, u32 *msg, u32
>> len);
>> int xe_guc_exec_queue_cgp_sync_done_handler(struct xe_guc *guc, u32
>> *msg, u32 len);
>> diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/
>> xe_trace.h
>> index 750fa32c13b2..2fe8f89a1e34 100644
>> --- a/drivers/gpu/drm/xe/xe_trace.h
>> +++ b/drivers/gpu/drm/xe/xe_trace.h
>> @@ -213,6 +213,11 @@ DEFINE_EVENT(xe_exec_queue,
>> xe_exec_queue_memory_cat_error,
>> TP_ARGS(q)
>> );
>> +DEFINE_EVENT(xe_exec_queue, xe_guc_uncorrectable_error,
>> + TP_PROTO(struct xe_exec_queue *q),
>> + TP_ARGS(q)
>> +);
>> +
>> DEFINE_EVENT(xe_exec_queue, xe_exec_queue_cgp_context_error,
>> TP_PROTO(struct xe_exec_queue *q),
>> TP_ARGS(q)
>
^ permalink raw reply [flat|nested] only message in thread