Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
To: Michal Wajdeczko <michal.wajdeczko@intel.com>,
	<intel-xe@lists.freedesktop.org>
Subject: Re: [PATCH v2 05/11] drm/xe/sriov: Add handling for MLRC adverse event threshold
Date: Mon, 8 Dec 2025 10:27:19 -0800	[thread overview]
Message-ID: <22a1be76-cf95-4119-ab15-40a98c567faa@intel.com> (raw)
In-Reply-To: <1c790980-36d0-43ad-b4b7-573d287df716@intel.com>



On 12/8/2025 9:52 AM, Daniele Ceraolo Spurio wrote:
>
>
> On 12/7/2025 2:03 PM, Michal Wajdeczko wrote:
>>
>> On 12/7/2025 12:04 AM, Daniele Ceraolo Spurio wrote:
>>> Since it is illegal to register an MLRC context when scheduler groups 
>>> are
>>> enabled, the GuC considers the VF doing so an adverse event. As with
>>> other adverse events, there is a threshold for how many times the event
>>> can happen before the GuC throws an error, which we need to add support
>>> for.
>>>
>>> Since this is the first threshold that we have that has a minimum GuC
>>> version requirement, support for checking that has been added to the
>>> generic threshold handling. As part of this, some of the version code
>>> has been moved to its own file and, while at it, some SRIOV
>>> documentation has been added.
>>>
>>> v2: split from previous patch, add GuC version checking
>>>
>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>>> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>> ---
>>>   drivers/gpu/drm/xe/abi/guc_klvs_abi.h         |  9 +++++
>>>   drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c    | 19 ++++++----
>>>   drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c   |  9 +++--
>>>   drivers/gpu/drm/xe/xe_guc.h                   |  7 +---
>>>   .../drm/xe/xe_guc_klv_thresholds_set_types.h  | 18 +++++-----
>>>   drivers/gpu/drm/xe/xe_guc_version.h           | 36 
>>> +++++++++++++++++++
>>>   6 files changed, 74 insertions(+), 24 deletions(-)
>>>   create mode 100644 drivers/gpu/drm/xe/xe_guc_version.h
>>>
>>> diff --git a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h 
>>> b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
>>> index edb0546fb163..30a051a0b4ee 100644
>>> --- a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
>>> +++ b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
>>> @@ -376,6 +376,12 @@ enum  {
>>>    *      :1: NORMAL = schedule VF always, irrespective of whether 
>>> it has work or not
>>>    *      :2: HIGH = schedule VF in the next time-slice after 
>>> current active
>>>    *          time-slice completes if it has active work
>>> + *
>>> + * _`GUC_KLV_VF_CFG_THRESHOLD_MULTI_LRC_COUNT` : 0x8A0D
>>> + *      Given that multi-LRC contexts are incompatible with SRIOV 
>>> scheduler
>>> + *      groups and cause the latter to be turned off when 
>>> registered with the
>>> + *      GuC, this config allows the PF to set a threshold for 
>>> multi-LRC context
>>> + *      registrations by VFs to monitor their behavior.
>>>    */
>>>     #define GUC_KLV_VF_CFG_GGTT_START_KEY        0x0001
>>> @@ -434,6 +440,9 @@ enum  {
>>>   #define   GUC_SCHED_PRIORITY_NORMAL        1u
>>>   #define   GUC_SCHED_PRIORITY_HIGH        2u
>>>   +#define GUC_KLV_VF_CFG_THRESHOLD_MULTI_LRC_COUNT_KEY 0x8a0d
>>> +#define GUC_KLV_VF_CFG_THRESHOLD_MULTI_LRC_COUNT_LEN    1u
>>> +
>>>   /*
>>>    * Workaround keys:
>>>    */
>>> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c 
>>> b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
>>> index 59c5c6b4d994..dda671d05b89 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
>>> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
>>> @@ -269,7 +269,8 @@ static u32 encode_config_ggtt(u32 *cfg, const 
>>> struct xe_gt_sriov_config *config,
>>>   }
>>>     /* Return: number of configuration dwords written */
>>> -static u32 encode_config(u32 *cfg, const struct xe_gt_sriov_config 
>>> *config, bool details)
>>> +static u32 encode_config(struct xe_gt *gt, u32 *cfg,
>>> +             const struct xe_gt_sriov_config *config, bool details)
>>>   {
>>>       u32 n = 0;
>>>   @@ -303,9 +304,11 @@ static u32 encode_config(u32 *cfg, const 
>>> struct xe_gt_sriov_config *config, bool
>>>       cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_PREEMPT_TIMEOUT);
>>>       cfg[n++] = config->preempt_timeout;
>>>   -#define encode_threshold_config(TAG, ...) ({                    \
>>> -    cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_THRESHOLD_##TAG);            \
>>> -    cfg[n++] = 
>>> config->thresholds[MAKE_XE_GUC_KLV_THRESHOLD_INDEX(TAG)]; \
>>> +#define encode_threshold_config(TAG, NAME, MIN_GUC_VER) 
>>> ({                \
>>> +    if (!MIN_GUC_VER || GUC_FIRMWARE_VER(&gt->uc.guc) >= 
>>> MIN_GUC_VER) {        \
>>> +        cfg[n++] = 
>>> PREP_GUC_KLV_TAG(VF_CFG_THRESHOLD_##TAG);            \
>>> +        cfg[n++] = 
>>> config->thresholds[MAKE_XE_GUC_KLV_THRESHOLD_INDEX(TAG)]; \
>>> +    }                                        \
>>>   });
>>> MAKE_XE_GUC_KLV_THRESHOLDS_SET(encode_threshold_config);
>>> @@ -328,7 +331,7 @@ static int pf_push_full_vf_config(struct xe_gt 
>>> *gt, unsigned int vfid)
>>>           return -ENOBUFS;
>>>         cfg = xe_guc_buf_cpu_ptr(buf);
>>> -    num_dwords = encode_config(cfg, config, true);
>>> +    num_dwords = encode_config(gt, cfg, config, true);
>>>       xe_gt_assert(gt, num_dwords <= max_cfg_dwords);
>>>         if (xe_gt_is_media_type(gt)) {
>>> @@ -2518,7 +2521,7 @@ ssize_t xe_gt_sriov_pf_config_save(struct 
>>> xe_gt *gt, unsigned int vfid, void *bu
>>>               ret = -ENOBUFS;
>>>           } else {
>>>               config = pf_pick_vf_config(gt, vfid);
>>> -            ret = encode_config(buf, config, false) * sizeof(u32);
>>> +            ret = encode_config(gt, buf, config, false) * sizeof(u32);
>>>           }
>>>       }
>>>       mutex_unlock(xe_gt_sriov_pf_master_mutex(gt));
>>> @@ -2551,9 +2554,11 @@ static int pf_restore_vf_config_klv(struct 
>>> xe_gt *gt, unsigned int vfid,
>>>           return pf_provision_preempt_timeout(gt, vfid, value[0]);
>>>         /* auto-generate case statements */
>>> -#define define_threshold_key_to_provision_case(TAG, 
>>> ...)                \
>>> +#define define_threshold_key_to_provision_case(TAG, NAME, 
>>> MIN_GUC_VER)            \
>>>       case MAKE_GUC_KLV_VF_CFG_THRESHOLD_KEY(TAG):                    \
>>>           BUILD_BUG_ON(MAKE_GUC_KLV_VF_CFG_THRESHOLD_LEN(TAG) != 
>>> 1u);        \
>>> +        if (MIN_GUC_VER && GUC_FIRMWARE_VER(&gt->uc.guc) < 
>>> MIN_GUC_VER) \
>>> +            return -ENOKEY;                            \
>>>           if (len != 
>>> MAKE_GUC_KLV_VF_CFG_THRESHOLD_LEN(TAG))            \
>>>               return -EBADMSG;                        \
>>>           return pf_provision_threshold(gt, vfid,                    \
>>> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c 
>>> b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c
>>> index 0fd863609848..5123ff1fb116 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c
>>> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c
>>> @@ -21,6 +21,7 @@
>>>   #include "xe_gt_sriov_pf_monitor.h"
>>>   #include "xe_gt_sriov_pf_policy.h"
>>>   #include "xe_gt_sriov_pf_service.h"
>>> +#include "xe_guc.h"
>>>   #include "xe_pm.h"
>>>   #include "xe_sriov_pf.h"
>>>   #include "xe_sriov_pf_provision.h"
>>> @@ -301,9 +302,11 @@ static void pf_add_config_attrs(struct xe_gt 
>>> *gt, struct dentry *parent, unsigne
>>>                      &sched_priority_fops);
>>>         /* register all threshold attributes */
>>> -#define register_threshold_attribute(TAG, NAME, ...) \
>>> -    debugfs_create_file_unsafe("threshold_" #NAME, 0644, parent, 
>>> parent, \
>>> -                   &NAME##_fops);
>>> +#define register_threshold_attribute(TAG, NAME, MIN_GUC_VER) 
>>> ({                \
>>> +    if (!MIN_GUC_VER || GUC_FIRMWARE_VER(&gt->uc.guc) >= 
>>> MIN_GUC_VER)        \
>>> +        debugfs_create_file_unsafe("threshold_" #NAME, 0644, 
>>> parent, parent,    \
>>> +                       &NAME##_fops);                \
>>> +});
>>> MAKE_XE_GUC_KLV_THRESHOLDS_SET(register_threshold_attribute)
>>>   #undef register_threshold_attribute
>>>   }
>>> diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
>>> index fdb08658d05a..9028718189ed 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc.h
>>> +++ b/drivers/gpu/drm/xe/xe_guc.h
>>> @@ -8,15 +8,10 @@
>>>     #include "xe_gt.h"
>>>   #include "xe_guc_types.h"
>>> +#include "xe_guc_version.h"
>>>   #include "xe_hw_engine_types.h"
>>>   #include "xe_macros.h"
>>>   -/*
>>> - * GuC version number components are defined to be only 8-bit size,
>>> - * so converting to a 32bit 8.8.8 integer allows simple (and safe)
>>> - * numerical comparisons.
>>> - */
>>> -#define MAKE_GUC_VER(maj, min, pat)    (((maj) << 16) | ((min) << 
>>> 8) | (pat))
>>>   #define MAKE_GUC_VER_STRUCT(ver) MAKE_GUC_VER((ver).major, 
>>> (ver).minor, (ver).patch)
>> I guess this macro can also be moved
>
> I purposely didn't move this as MAKE_GUC_VER_STRUCT is specific to how 
> we code xe_uc_fw_version, while MAKE_GUC_VER is based on what the GuC 
> interface defines.
>
>>
>>>   #define GUC_SUBMIT_VER(guc) \
>>> MAKE_GUC_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_COMPATIBILITY])
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_klv_thresholds_set_types.h 
>>> b/drivers/gpu/drm/xe/xe_guc_klv_thresholds_set_types.h
>>> index 0a028c94756d..f7ed32244c6b 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_klv_thresholds_set_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_guc_klv_thresholds_set_types.h
>>> @@ -7,6 +7,7 @@
>>>   #define _XE_GUC_KLV_THRESHOLDS_SET_TYPES_H_
>>>     #include "xe_args.h"
>>> +#include "xe_guc_version.h"
>>>     /**
>>>    * MAKE_XE_GUC_KLV_THRESHOLDS_SET - Generate various GuC 
>>> thresholds definitions.
>>> @@ -23,15 +24,16 @@
>>>    * with the &TAG, that corresponds to the GuC threshold KLV key 
>>> name defined by
>>>    * ABI and the associated &NAME, that may be used in code or 
>>> debugfs/sysfs::
>>>    *
>>> - *    define(TAG, NAME)
>>> + *    define(TAG, NAME, MIN_GUC_VER)
>>>    */
>>> -#define MAKE_XE_GUC_KLV_THRESHOLDS_SET(define)        \
>>> -    define(CAT_ERR, cat_error_count)        \
>>> -    define(ENGINE_RESET, engine_reset_count)    \
>>> -    define(PAGE_FAULT, page_fault_count)        \
>>> -    define(H2G_STORM, guc_time_us)            \
>>> -    define(IRQ_STORM, irq_time_us)            \
>>> -    define(DOORBELL_STORM, doorbell_time_us)    \
>>> +#define MAKE_XE_GUC_KLV_THRESHOLDS_SET(define)                    \
>>> +    define(CAT_ERR, cat_error_count, 0)                    \
>>> +    define(ENGINE_RESET, engine_reset_count, 0)                \
>>> +    define(PAGE_FAULT, page_fault_count, 0)                    \
>>> +    define(H2G_STORM, guc_time_us, 0)                    \
>>> +    define(IRQ_STORM, irq_time_us, 0)                    \
>>> +    define(DOORBELL_STORM, doorbell_time_us, 0)                \
>>> +    define(MULTI_LRC_COUNT, multi_lrc_count, MAKE_GUC_VER(70, 53, 
>>> 0))    \
>>>       /* end */
>>>     /**
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_version.h 
>>> b/drivers/gpu/drm/xe/xe_guc_version.h
>>> new file mode 100644
>> introduction of this new ver.h file is self-contained so maybe it 
>> should be in its own patch?
>
> IMO it is simple enough to keep here, to avoid too many small patches 
> in the series.
>
>>
>>> index 000000000000..e6f80abd2f05
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/xe_guc_version.h
>>> @@ -0,0 +1,36 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2025 Intel Corporation
>>> + */
>>> +
>>> +#ifndef _XE_GUC_VERSION_H_
>>> +#define _XE_GUC_VERSION_H_
>>> +
>>> +/*
>> this should be regular kernel-doc
>>
>>> + * GuC version number components are defined to be only 8-bit size,
>>> + * so converting to a 32bit 8.8.8 integer allows simple (and safe)
>>> + * numerical comparisons.
>>> + */
>>> +#define MAKE_GUC_VER(maj, min, pat)    (((maj) << 16) | ((min) << 
>>> 8) | (pat))
>>> +
>>> +/**
>>> + * DOC: SRIOV-changes
>>     DOC: SR-IOV Changes
>>
>>> + *
>>> + * We record SRIOV-specific changes here as those need to be 
>>> tracked carefully.
>>> + *
>> what about 1.23.0 (CCS) ?
>
> If you tell me exactly what to write I'll add it in, because I don't 
> know the specifics.
>
>>
>>> + * GuC 70.53.0 (VF interface 1.26.0):
>>> + *
>>> + * Added support for EGS. See:
>> probably we need extra line here to render correctly
>>
>>> + *  * GUC_KLV_VGT_POLICY_ENGINE_GROUP_CONFIG
>>> + *  * GUC_KLV_VF_CFG_THRESHOLD_MULTI_LRC_COUNT
>>> + *
>>> + * GuC 70.54.0 (VF interface 1.27.0):
>>> + *
>>> + * Updated VF migration support. See RESFIX actions
>> maybe we can list those actions:
>>
>>     * VF2GUC_RESFIX_START
>>     * VF2GUC_RESFIX_DONE
>
> Those don't seem to yet be in the driver. Should I list them anyway?

Ignore this question, I was looking in an older tree.

Daniele

>
> Daniele
>
>>> + *
>>> + * GuC 70.55.1 (VF interface 1.28.1):
>>> + *
>>> + * Fixes for EGS.
>>> + */
>>> +
>>> +#endif
>


  reply	other threads:[~2025-12-08 18:27 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-06 23:03 [PATCH v2 00/11] Introduce SRIOV scheduler groups Daniele Ceraolo Spurio
2025-12-06 23:03 ` [PATCH v2 01/11] drm/xe/gt: Add engine masks for each class Daniele Ceraolo Spurio
2025-12-07 15:35   ` Michal Wajdeczko
2025-12-06 23:03 ` [PATCH v2 02/11] drm/xe/sriov: Initialize scheduler groups Daniele Ceraolo Spurio
2025-12-07 21:57   ` Michal Wajdeczko
2025-12-08 17:36     ` Daniele Ceraolo Spurio
2025-12-06 23:03 ` [PATCH v2 03/11] drm/xe/sriov: Add support for enabling " Daniele Ceraolo Spurio
2025-12-07 21:57   ` Michal Wajdeczko
2025-12-08 17:41     ` Daniele Ceraolo Spurio
2025-12-06 23:04 ` [PATCH v2 04/11] drm/xe/sriov: Scheduler groups are incompatible with multi-lrc Daniele Ceraolo Spurio
2025-12-07 21:58   ` Michal Wajdeczko
2025-12-08 17:48     ` Daniele Ceraolo Spurio
2025-12-06 23:04 ` [PATCH v2 05/11] drm/xe/sriov: Add handling for MLRC adverse event threshold Daniele Ceraolo Spurio
2025-12-07 22:03   ` Michal Wajdeczko
2025-12-08 17:52     ` Daniele Ceraolo Spurio
2025-12-08 18:27       ` Daniele Ceraolo Spurio [this message]
2025-12-06 23:04 ` [PATCH v2 06/11] drm/xe/sriov: Add debugfs to enable scheduler groups Daniele Ceraolo Spurio
2025-12-08 23:38   ` Michal Wajdeczko
2025-12-09  0:36     ` Daniele Ceraolo Spurio
2025-12-09 15:07       ` Michal Wajdeczko
2025-12-09 18:09         ` Daniele Ceraolo Spurio
2025-12-06 23:04 ` [PATCH v2 07/11] drm/xe/sriov: Add debugfs with scheduler groups information Daniele Ceraolo Spurio
2025-12-09  0:08   ` Michal Wajdeczko
2025-12-09  0:23     ` Daniele Ceraolo Spurio
2025-12-06 23:04 ` [PATCH v2 08/11] drm/xe/sriov: Prep for multiple exec quantums and preemption timeouts Daniele Ceraolo Spurio
2025-12-06 23:04 ` [PATCH v2 09/11] drm/xe/sriov: Add functions to set exec quantums for each group Daniele Ceraolo Spurio
2025-12-06 23:04 ` [PATCH v2 10/11] drm/xe/sriov: Add functions to set preempt timeouts " Daniele Ceraolo Spurio
2025-12-06 23:04 ` [PATCH v2 11/11] drm/xe/sriov: Add debugfs to set EQ and PT for scheduler groups Daniele Ceraolo Spurio
2025-12-06 23:10 ` ✗ CI.checkpatch: warning for Introduce SRIOV scheduler groups (rev2) Patchwork
2025-12-06 23:11 ` ✓ CI.KUnit: success " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=22a1be76-cf95-4119-ab15-40a98c567faa@intel.com \
    --to=daniele.ceraolospurio@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=michal.wajdeczko@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox