From: Rodrigo Vivi <rodrigo.vivi@intel.com>
To: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: <intel-xe@lists.freedesktop.org>
Subject: Re: [v4] drm/xe: Allow per queue programming of COMMON_SLICE_CHICKEN3 bit13
Date: Tue, 3 Mar 2026 16:09:59 -0500 [thread overview]
Message-ID: <aadOJ6YTk2fhOEBn@intel.com> (raw)
In-Reply-To: <20260227112310.1222483-1-lionel.g.landwerlin@intel.com>
On Fri, Feb 27, 2026 at 01:23:08PM +0200, Lionel Landwerlin wrote:
> Similar to i915's commit cebc13de7e704b1355bea208a9f9cdb042c74588
> ("drm/i915: Whitelist COMMON_SLICE_CHICKEN3 for UMD access"), except
> people have decided to not rely on putting the register on the
> allowlist for UMD to program and instead have context/queue creation
> flag.
NACK!
Broken record: https://lore.kernel.org/intel-xe/aZXhoXWY8tDl63Lw@intel.com/
>
> This is a recommended tuning setting for both gen12 and Xe_HP
> platforms.
>
> If a render queue is created with
> DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX, COMMON_SLICE_CHICKEN3 will
> be programmed at initialization to enable the render color cache to
> key with BTP+BTI (binding table pool + binding table entry) instead of
> just BTI (binding table entry). This enables the UMD to avoid emitting
> render-target-cache-flush + stall-at-pixel-scoreboard every time a
> binding table entry pointing to a render target is changed.
>
> v2: Use xe_lrc_write_ring()
>
> v3: Update xe_query.c to report availability
>
> v4: Rename defines to add DISABLE_
>
> Bspec: 73993, 73994, 72161, 31870, 68331
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> ---
> drivers/gpu/drm/xe/regs/xe_gt_regs.h | 1 +
> drivers/gpu/drm/xe/xe_exec_queue.c | 19 ++++++++++++++++++-
> drivers/gpu/drm/xe/xe_exec_queue_types.h | 2 ++
> drivers/gpu/drm/xe/xe_lrc.c | 9 +++++++++
> drivers/gpu/drm/xe/xe_lrc.h | 1 +
> drivers/gpu/drm/xe/xe_query.c | 2 ++
> include/uapi/drm/xe_drm.h | 8 ++++++++
> 7 files changed, 41 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> index 90b9017770ea2..33b94ec344044 100644
> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> @@ -179,6 +179,7 @@
>
> #define COMMON_SLICE_CHICKEN3 XE_REG(0x7304, XE_REG_OPTION_MASKED)
> #define XEHP_COMMON_SLICE_CHICKEN3 XE_REG_MCR(0x7304, XE_REG_OPTION_MASKED)
> +#define DISABLE_STATE_CACHE_PERF_FIX REG_BIT(13)
> #define DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN REG_BIT(12)
> #define XEHP_DUAL_SIMD8_SEQ_MERGE_DISABLE REG_BIT(12)
> #define BLEND_EMB_FIX_DISABLE_IN_RCC REG_BIT(11)
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index f8980cb7293dd..93b39a3d60800 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -292,6 +292,9 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q, u32 exec_queue_flags)
> if (!(exec_queue_flags & EXEC_QUEUE_FLAG_KERNEL))
> flags |= XE_LRC_CREATE_USER_CTX;
>
> + if (q->flags & EXEC_QUEUE_FLAG_DISABLE_STATE_CACHE_PERF_FIX)
> + flags |= XE_LRC_DISABLE_STATE_CACHE_PERF_FIX;
> +
> err = q->ops->init(q);
> if (err)
> return err;
> @@ -850,6 +853,17 @@ static int exec_queue_set_multi_queue_priority(struct xe_device *xe, struct xe_e
> return q->ops->set_multi_queue_priority(q, value);
> }
>
> +static int exec_queue_set_state_cache_perf_fix(struct xe_device *xe, struct xe_exec_queue *q,
> + u64 value)
> +{
> + if (XE_IOCTL_DBG(xe, q->class != XE_ENGINE_CLASS_RENDER))
> + return -EOPNOTSUPP;
> +
> + q->flags |= value != 0 ? EXEC_QUEUE_FLAG_DISABLE_STATE_CACHE_PERF_FIX : 0;
> +
> + return 0;
> +}
> +
> typedef int (*xe_exec_queue_set_property_fn)(struct xe_device *xe,
> struct xe_exec_queue *q,
> u64 value);
> @@ -862,6 +876,8 @@ static const xe_exec_queue_set_property_fn exec_queue_set_property_funcs[] = {
> [DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP] = exec_queue_set_multi_group,
> [DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY] =
> exec_queue_set_multi_queue_priority,
> + [DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX] =
> + exec_queue_set_state_cache_perf_fix,
> };
>
> int xe_exec_queue_set_property_ioctl(struct drm_device *dev, void *data,
> @@ -946,7 +962,8 @@ static int exec_queue_user_ext_set_property(struct xe_device *xe,
> ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_PXP_TYPE &&
> ext.property != DRM_XE_EXEC_QUEUE_SET_HANG_REPLAY_STATE &&
> ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP &&
> - ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY))
> + ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY &&
> + ext.property != DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX))
> return -EINVAL;
>
> idx = array_index_nospec(ext.property, ARRAY_SIZE(exec_queue_set_property_funcs));
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> index 3791fed34ffa5..e4e3dbe956f05 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> @@ -134,6 +134,8 @@ struct xe_exec_queue {
> #define EXEC_QUEUE_FLAG_LOW_LATENCY BIT(5)
> /* for migration (kernel copy, clear, bind) jobs */
> #define EXEC_QUEUE_FLAG_MIGRATE BIT(6)
> +/* for programming COMMON_SLICE_CHICKEN3 on first submission */
> +#define EXEC_QUEUE_FLAG_DISABLE_STATE_CACHE_PERF_FIX BIT(7)
>
> /**
> * @flags: flags for this exec queue, should statically setup aside from ban
> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> index 384f9b31421ec..1ddc4f269d08c 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.c
> +++ b/drivers/gpu/drm/xe/xe_lrc.c
> @@ -14,6 +14,7 @@
> #include "instructions/xe_gfxpipe_commands.h"
> #include "instructions/xe_gfx_state_commands.h"
> #include "regs/xe_engine_regs.h"
> +#include "regs/xe_gt_regs.h"
> #include "regs/xe_lrc_layout.h"
> #include "xe_bb.h"
> #include "xe_bo.h"
> @@ -1451,6 +1452,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> struct xe_bo *seqno_bo;
> struct iosys_map map;
> u32 arb_enable;
> + u32 state_cache_perf_fix[3];
> u32 bo_flags;
> int err;
>
> @@ -1594,6 +1596,13 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
>
> + if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
> + state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
> + state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
> + state_cache_perf_fix[2] = _MASKED_BIT_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
> + xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
> + }
> +
> map = __xe_lrc_seqno_map(lrc);
> xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
>
> diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
> index 3e500004f1ae4..1c4ae5f6f06da 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.h
> +++ b/drivers/gpu/drm/xe/xe_lrc.h
> @@ -49,6 +49,7 @@ struct xe_lrc_snapshot {
> #define XE_LRC_CREATE_RUNALONE BIT(0)
> #define XE_LRC_CREATE_PXP BIT(1)
> #define XE_LRC_CREATE_USER_CTX BIT(2)
> +#define XE_LRC_DISABLE_STATE_CACHE_PERF_FIX BIT(3)
>
> struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
> void *replay_state, u32 ring_size, u16 msix_vec, u32 flags);
> diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
> index 34db266b723fa..4852fdcb4b959 100644
> --- a/drivers/gpu/drm/xe/xe_query.c
> +++ b/drivers/gpu/drm/xe/xe_query.c
> @@ -340,6 +340,8 @@ static int query_config(struct xe_device *xe, struct drm_xe_device_query *query)
> DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT;
> config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
> DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY;
> + config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
> + DRM_XE_QUERY_CONFIG_FLAG_HAS_DISABLE_STATE_CACHE_PERF_FIX;
> config->info[DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT] =
> xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K;
> config->info[DRM_XE_QUERY_CONFIG_VA_BITS] = xe->info.va_bits;
> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index ef2565048bdf1..df1dc6b9cbc8c 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -406,6 +406,9 @@ struct drm_xe_query_mem_regions {
> * - %DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT - Flag is set if the
> * device supports the userspace hint %DRM_XE_GEM_CREATE_FLAG_NO_COMPRESSION.
> * This is exposed only on Xe2+.
> + * - %DRM_XE_QUERY_CONFIG_FLAG_HAS_DISABLE_STATE_CACHE_PERF_FIX - Flag is set
> + * if a queue can be created with
> + * %DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX
> * - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment
> * required by this device, typically SZ_4K or SZ_64K
> * - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address
> @@ -425,6 +428,7 @@ struct drm_xe_query_config {
> #define DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY (1 << 1)
> #define DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR (1 << 2)
> #define DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT (1 << 3)
> + #define DRM_XE_QUERY_CONFIG_FLAG_HAS_DISABLE_STATE_CACHE_PERF_FIX (1 << 4)
> #define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT 2
> #define DRM_XE_QUERY_CONFIG_VA_BITS 3
> #define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY 4
> @@ -1285,6 +1289,9 @@ struct drm_xe_vm_bind {
> * - %DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY - Set the queue
> * priority within the multi-queue group. Current valid priority values are 0–2
> * (default is 1), with higher values indicating higher priority.
> + * - %DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX - Set the queue to
> + * enable render color cache keying on BTP+BTI instead of just BTI
> + * (only valid for render queues).
> *
> * The example below shows how to use @drm_xe_exec_queue_create to create
> * a simple exec_queue (no parallel submission) of class
> @@ -1329,6 +1336,7 @@ struct drm_xe_exec_queue_create {
> #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP 4
> #define DRM_XE_MULTI_GROUP_CREATE (1ull << 63)
> #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY 5
> +#define DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX 6
> /** @extensions: Pointer to the first extension struct, if any */
> __u64 extensions;
>
> --
> 2.43.0
>
prev parent reply other threads:[~2026-03-03 21:10 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-27 11:23 [v4] drm/xe: Allow per queue programming of COMMON_SLICE_CHICKEN3 bit13 Lionel Landwerlin
2026-02-27 11:28 ` ✓ CI.KUnit: success for drm/xe: Allow per queue programming of COMMON_SLICE_CHICKEN3 bit13 (rev4) Patchwork
2026-02-27 12:13 ` ✓ Xe.CI.BAT: " Patchwork
2026-02-27 19:54 ` ✗ Xe.CI.FULL: failure " Patchwork
2026-03-03 21:09 ` Rodrigo Vivi [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aadOJ6YTk2fhOEBn@intel.com \
--to=rodrigo.vivi@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=lionel.g.landwerlin@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.