Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Fwd: [PATCH v2 1/1] drm/xe/guc: Handle GuC local uncorrectable error notifications
       [not found] <ffebb8b2-43c7-408f-86d1-218a0b13cf55@intel.com>
@ 2026-06-25 18:09 ` Dong, Zhanjun
  0 siblings, 0 replies; only message in thread
From: Dong, Zhanjun @ 2026-06-25 18:09 UTC (permalink / raw)
  To: intel-xe@lists.freedesktop.org


Add back mailinglist


-------- Forwarded Message --------
Subject: Re: [PATCH v2 1/1] drm/xe/guc: Handle GuC local uncorrectable 
error notifications
Date: Thu, 25 Jun 2026 11:47:14 -0400
From: Dong, Zhanjun <zhanjun.dong@intel.com>
To: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Thanks for take time to review it. See my comment inline below.

Regards,
Zhanjun Dong

On 2026-06-24 5:41 p.m., Daniele Ceraolo Spurio wrote:
> 
> 
> On 6/12/2026 3:44 PM, Zhanjun Dong wrote:
>> Add support for the GuC uncorrectable local error G2H notification and
>> opt in to the feature when the submission ABI exposes it.
>>
>> When the notification targets a known exec queue, treat it like an
>> engine reset request and route it through the existing timeout cleanup
>> path. This keeps the queue teardown, pending job cancellation and error
>> capture in one place instead of open-coding a parallel recovery flow.
>>
>> The timeout worker also needs to cope with this externally triggered
>> reset path. Keep permanent or wedged queue destruction asynchronous when
>> the timeout path is already running from a scheduler worker.
>>
>> Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
>> ---
>> History:
>> v2: Opt in for Xe3p only, excluding media GTs on NovaLake-P which 
>> don't support the feature
>>      Remove timeout bypass, which is outside the scope of this patch 
>> and can be added separately
>> ---
>>   drivers/gpu/drm/xe/abi/guc_actions_abi.h |  1 +
>>   drivers/gpu/drm/xe/abi/guc_klvs_abi.h    |  8 ++++
>>   drivers/gpu/drm/xe/xe_guc.c              |  9 ++++
>>   drivers/gpu/drm/xe/xe_guc_ct.c           |  3 ++
>>   drivers/gpu/drm/xe/xe_guc_submit.c       | 56 +++++++++++++++++++++++-
>>   drivers/gpu/drm/xe/xe_guc_submit.h       |  1 +
>>   drivers/gpu/drm/xe/xe_trace.h            |  5 +++
>>   7 files changed, 82 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/ 
>> drm/xe/abi/guc_actions_abi.h
>> index 83a6e7794982..f5c9b37038d4 100644
>> --- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> @@ -152,6 +152,7 @@ enum xe_guc_action {
>>       XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC = 0x6002,
>>       XE_GUC_ACTION_PAGE_FAULT_RES_DESC = 0x6003,
>>       XE_GUC_ACTION_ACCESS_COUNTER_NOTIFY = 0x6004,
>> +    XE_GUC_ACTION_NOTIFY_UNCORRECTABLE_LOCAL_ERROR = 0x6005,
>>       XE_GUC_ACTION_TLB_INVALIDATION = 0x7000,
>>       XE_GUC_ACTION_TLB_INVALIDATION_DONE = 0x7001,
>>       XE_GUC_ACTION_TLB_INVALIDATION_ALL = 0x7002,
>> diff --git a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h b/drivers/gpu/drm/ 
>> xe/abi/guc_klvs_abi.h
>> index 644f5a4226d7..5c428f02a642 100644
>> --- a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
>> +++ b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
>> @@ -154,6 +154,11 @@ enum  {
>>    *      (instead of waiting the full timeslice duration). The bit is 
>> instead set
>>    *      to one if a single context is queued on the engine, to avoid 
>> it being
>>    *      switched out if there isn't another context that can run in 
>> its place.
>> + *
>> + * _`GUC_KLV_OPT_IN_FEATURE_UNCORRECTABLE_LOCAL_ERROR_NOTIFICATION` : 
>> 0x4004
>> + *      This flag will enable notification from GuC to KMD via G2H 
>> message
>> + *      GUC_ACTION_GUC2HOST_NOTIFY_UNCORRECTABLE_LOCAL_ERROR upon 
>> receiving the
>> + *      same interrupt from the CS.
>>    */
>>   #define GUC_KLV_OPT_IN_FEATURE_EXT_CAT_ERR_TYPE_KEY 0x4001
>> @@ -162,6 +167,9 @@ enum  {
>>   #define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_KEY 
>> 0x4003
>>   #define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_LEN 0u
>> +#define 
>> GUC_KLV_OPT_IN_FEATURE_UNCORRECTABLE_LOCAL_ERROR_NOTIFICATION_KEY 0x4004
>> +#define 
>> GUC_KLV_OPT_IN_FEATURE_UNCORRECTABLE_LOCAL_ERROR_NOTIFICATION_LEN 0u
>> +
>>   /**
>>    * DOC: GuC Scheduling Policies KLVs
>>    *
>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>> index 4023700ff2a9..3b3c6c450062 100644
>> --- a/drivers/gpu/drm/xe/xe_guc.c
>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>> @@ -12,6 +12,7 @@
>>   #include "abi/guc_actions_abi.h"
>>   #include "abi/guc_errors_abi.h"
>> +#include "abi/guc_klvs_abi.h"
>>   #include "regs/xe_gt_regs.h"
>>   #include "regs/xe_gtt_defs.h"
>>   #include "regs/xe_guc_regs.h"
>> @@ -641,6 +642,14 @@ int xe_guc_opt_in_features_enable(struct xe_guc 
>> *guc)
>>       if (GUC_SUBMIT_VER(guc) >= MAKE_GUC_VER(1, 7, 0))
>>           klvs[count++] = 
>> PREP_GUC_KLV_TAG(OPT_IN_FEATURE_EXT_CAT_ERR_TYPE);
>> +    /*
>> +     * The uncorrectable local error notification opt-in was added in
>> +     * GuC v70.38.0, which maps to compatibility version v1.18.0.
>> +     */
>> +    if (GRAPHICS_VER(xe) >= 35 && GUC_SUBMIT_VER(guc) >= 
>> MAKE_GUC_VER(1, 18, 0) &&
>> +        !(xe->info.platform == XE_NOVALAKE_P && 
>> xe_gt_is_media_type(guc_to_gt(guc))))
>> +        klvs[count++] = 
>> PREP_GUC_KLV_TAG(OPT_IN_FEATURE_UNCORRECTABLE_LOCAL_ERROR_NOTIFICATION);
>> +
>>       if (supports_dynamic_ics(guc))
>>           klvs[count++] = 
>> PREP_GUC_KLV_TAG(OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH);
>> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/ 
>> xe_guc_ct.c
>> index 21e0dad9a481..fe6a55fe7fb5 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
>> @@ -1661,6 +1661,9 @@ static int process_g2h_msg(struct xe_guc_ct *ct, 
>> u32 *msg, u32 len)
>>           ret = xe_guc_exec_queue_memory_cat_error_handler(guc, payload,
>>                                    adj_len);
>>           break;
>> +    case XE_GUC_ACTION_NOTIFY_UNCORRECTABLE_LOCAL_ERROR:
>> +        ret = xe_guc_uncorrectable_error_handler(guc, payload, adj_len);
>> +        break;
>>       case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
>>           ret = xe_guc_pagefault_handler(guc, payload, adj_len);
>>           break;
>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/ 
>> xe_guc_submit.c
>> index b29cc08e6291..2840ab0bcd09 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>> @@ -9,6 +9,7 @@
>>   #include <linux/bitmap.h>
>>   #include <linux/circ_buf.h>
>>   #include <linux/dma-fence-array.h>
>> +#include <linux/workqueue.h>
>>   #include <drm/drm_managed.h>
>> @@ -71,6 +72,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
>>   #define EXEC_QUEUE_STATE_WEDGED            (1 << 8)
>>   #define EXEC_QUEUE_STATE_BANNED            (1 << 9)
>>   #define EXEC_QUEUE_STATE_PENDING_RESUME        (1 << 10)
>> +#define EXEC_QUEUE_STATE_UNCORRECTABLE_ERROR    (1 << 11)
> 
> This can be dropped if no upstream user
Sure, will drop as not referenced

> 
>>   static bool exec_queue_registered(struct xe_exec_queue *q)
>>   {
>> @@ -217,6 +219,11 @@ static void 
>> clear_exec_queue_pending_resume(struct xe_exec_queue *q)
>>       atomic_and(~EXEC_QUEUE_STATE_PENDING_RESUME, &q->guc->state);
>>   }
>> +static void set_exec_queue_uncorrectable_error(struct xe_exec_queue *q)
>> +{
>> +    atomic_or(EXEC_QUEUE_STATE_UNCORRECTABLE_ERROR, &q->guc->state);
>> +}
>> +
>>   static bool exec_queue_killed_or_banned_or_wedged(struct 
>> xe_exec_queue *q)
>>   {
>>       return (atomic_read(&q->guc->state) &
>> @@ -1697,6 +1704,19 @@ static void 
>> __guc_exec_queue_destroy_async(struct work_struct *w)
>>       xe_exec_queue_fini(q);
>>   }
>> +static bool guc_exec_queue_in_sched_worker(struct xe_exec_queue *q)
>> +{
>> +    struct work_struct *work = current_work();
>> +
>> +    if (!work)
>> +        return false;
>> +
>> +    return work == &q->guc->sched.base.work_run_job ||
>> +        work == &q->guc->sched.base.work_free_job ||
>> +        work == &q->guc->sched.base.work_tdr.work ||
>> +        work == &q->guc->sched.work_process_msg;
>> +}
>> +
> 
> This can be dropped, replaced by Matt's fix
To be dropped

> 
>>   static void guc_exec_queue_destroy_async(struct xe_exec_queue *q)
>>   {
>>       struct xe_guc *guc = exec_queue_to_guc(q);
>> @@ -1705,7 +1725,8 @@ static void guc_exec_queue_destroy_async(struct 
>> xe_exec_queue *q)
>>       INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async);
>>       /* We must block on kernel engines so slabs are empty on driver 
>> unload */
>> -    if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
>> +    if ((q->flags & EXEC_QUEUE_FLAG_PERMANENT || 
>> exec_queue_wedged(q)) &&
>> +        !guc_exec_queue_in_sched_worker(q))
>>           __guc_exec_queue_destroy_async(&q->guc->destroy_async);
>>       else
>>           queue_work(xe->destroy_wq, &q->guc->destroy_async);
>> @@ -2997,6 +3018,39 @@ int 
>> xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>>       return 0;
>>   }
>> +int xe_guc_uncorrectable_error_handler(struct xe_guc *guc, u32 *msg, 
>> u32 len)
>> +{
>> +    struct xe_gt *gt = guc_to_gt(guc);
>> +    struct xe_exec_queue *q;
>> +    u32 guc_id;
>> +
>> +    if (unlikely(!len || len > 2))
> 
> Here you're allowing len = 1 and len = 2, but below you only read 1 
> dword (msg[0]).
You are right, expected length is 1, to be corrected in next rev.

> 
>> +        return -EPROTO;
>> +
>> +    guc_id = msg[0];
>> +
>> +    if (guc_id == GUC_ID_UNKNOWN) {
>> +        xe_gt_err(gt, "GuC: Uncorrectable local error! guc_id=%d\n", 
>> guc_id);
>> +        return 0;
>> +    }
>> +
>> +    q = g2h_exec_queue_lookup(guc, guc_id);
>> +    if (unlikely(!q))
>> +        return -EPROTO;
>> +
>> +    xe_gt_err(gt,
>> +          "GuC: Uncorrectable local error! guc_id=%d class=%s, 
>> logical_mask=0x%x",
>> +          guc_id, xe_hw_engine_class_to_str(q->class), q->logical_mask);
>> +
>> +    trace_xe_guc_uncorrectable_error(q);
>> +    set_exec_queue_uncorrectable_error(q);
>> +
>> +    if (guc_to_xe(guc)->wedged.mode != 
>> XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET)
> 
> This needs to be a separate patch, probably implemented further down the 
> reset path.
Yes, to be moved into a separate patch

> 
>> +        xe_guc_exec_queue_reset_trigger_cleanup(q);
>> +
>> +    return 0;
>> +}
>> +
>>   int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 
>> *msg, u32 len)
>>   {
>>       struct xe_gt *gt = guc_to_gt(guc);
>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/ 
>> xe_guc_submit.h
>> index b3839a90c142..ccade320dc69 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_submit.h
>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.h
>> @@ -34,6 +34,7 @@ int xe_guc_deregister_done_handler(struct xe_guc 
>> *guc, u32 *msg, u32 len);
>>   int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, 
>> u32 len);
>>   int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, 
>> u32 *msg,
>>                              u32 len);
>> +int xe_guc_uncorrectable_error_handler(struct xe_guc *guc, u32 *msg, 
>> u32 len);
>>   int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 
>> *msg, u32 len);
>>   int xe_guc_error_capture_handler(struct xe_guc *guc, u32 *msg, u32 
>> len);
>>   int xe_guc_exec_queue_cgp_sync_done_handler(struct xe_guc *guc, u32 
>> *msg, u32 len);
>> diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/ 
>> xe_trace.h
>> index 750fa32c13b2..2fe8f89a1e34 100644
>> --- a/drivers/gpu/drm/xe/xe_trace.h
>> +++ b/drivers/gpu/drm/xe/xe_trace.h
>> @@ -213,6 +213,11 @@ DEFINE_EVENT(xe_exec_queue, 
>> xe_exec_queue_memory_cat_error,
>>            TP_ARGS(q)
>>   );
>> +DEFINE_EVENT(xe_exec_queue, xe_guc_uncorrectable_error,
>> +         TP_PROTO(struct xe_exec_queue *q),
>> +         TP_ARGS(q)
>> +);
>> +
>>   DEFINE_EVENT(xe_exec_queue, xe_exec_queue_cgp_context_error,
>>            TP_PROTO(struct xe_exec_queue *q),
>>            TP_ARGS(q)
> 


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2026-06-25 18:09 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <ffebb8b2-43c7-408f-86d1-218a0b13cf55@intel.com>
2026-06-25 18:09 ` Fwd: [PATCH v2 1/1] drm/xe/guc: Handle GuC local uncorrectable error notifications Dong, Zhanjun

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox