Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
To: Matt Roper <matthew.d.roper@intel.com>
Cc: <intel-xe@lists.freedesktop.org>
Subject: Re: [v2] drm/xe: Allow per queue programming of COMMON_SLICE_CHICKEN3 bit13
Date: Mon, 2 Mar 2026 09:52:04 +0200	[thread overview]
Message-ID: <711d1ca5-25ed-4518-8282-ec52251cc094@intel.com> (raw)
In-Reply-To: <20260227221703.GR52346@mdroper-desk1.amr.corp.intel.com>

On 28/02/2026 00:17, Matt Roper wrote:
> On Fri, Feb 27, 2026 at 02:12:34PM -0800, Matt Roper wrote:
>> On Fri, Feb 27, 2026 at 10:42:41AM +0200, Lionel Landwerlin wrote:
>>> On 18/02/2026 01:51, Matt Roper wrote:
>>>> On Tue, Feb 17, 2026 at 10:34:28AM +0200, Lionel Landwerlin wrote:
>>>>> Similar to i915's commit cebc13de7e704b1355bea208a9f9cdb042c74588
>>>>> ("drm/i915: Whitelist COMMON_SLICE_CHICKEN3 for UMD access"), except
>>>>> people have decided to not rely on putting the register on the
>>>>> allowlist for UMD to program and instead have a context/queue creation
>>>>> flag.
>>>>>
>>>>> This is a recommended tuning setting for both gen12 and Xe_HP
>>>>> platforms.
>>>>>
>>>>> If a render queue is created with
>>>>> DRM_XE_EXEC_QUEUE_SET_STATE_CACHE_PERF_FIX, COMMON_SLICE_CHICKEN3 will
>>>>> be programmed at initialization to enable the render color cache to
>>>>> key with BTP+BTI (binding table pool + binding table entry) instead of
>>>>> just BTI (binding table entry). This enables the UMD to avoid emitting
>>>>> render-target-cache-flush + stall-at-pixel-scoreboard every time a
>>>>> binding table entry pointing to a render target is changed.
>>>>>
>>>>> Bspec: 73993, 73994, 72161, 31870, 68331
>>>>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>>>>> ---
>>>>>    drivers/gpu/drm/xe/regs/xe_gt_regs.h     |  1 +
>>>>>    drivers/gpu/drm/xe/xe_exec_queue.c       | 18 +++++++++++++++++-
>>>>>    drivers/gpu/drm/xe/xe_exec_queue_types.h |  2 ++
>>>>>    drivers/gpu/drm/xe/xe_lrc.c              |  9 +++++++++
>>>>>    drivers/gpu/drm/xe/xe_lrc.h              |  1 +
>>>>>    drivers/gpu/drm/xe/xe_query.c            |  2 ++
>>>>>    include/uapi/drm/xe_drm.h                |  8 ++++++++
>>>>>    7 files changed, 40 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>>>>> index a375ffd666ba2..80a438e51419f 100644
>>>>> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>>>>> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>>>>> @@ -178,6 +178,7 @@
>>>>>    #define COMMON_SLICE_CHICKEN3				XE_REG(0x7304, XE_REG_OPTION_MASKED)
>>>>>    #define XEHP_COMMON_SLICE_CHICKEN3			XE_REG_MCR(0x7304, XE_REG_OPTION_MASKED)
>>>>> +#define   STATE_CACHE_PERF_FIX_DISABLED			REG_BIT(13)
>>>>>    #define   DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN	REG_BIT(12)
>>>>>    #define   XEHP_DUAL_SIMD8_SEQ_MERGE_DISABLE		REG_BIT(12)
>>>>>    #define   BLEND_EMB_FIX_DISABLE_IN_RCC			REG_BIT(11)
>>>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
>>>>> index 66d0e10ee2c4a..d3168353fcaaf 100644
>>>>> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
>>>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
>>>>> @@ -292,6 +292,9 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q, u32 exec_queue_flags)
>>>>>    	if (!(exec_queue_flags & EXEC_QUEUE_FLAG_KERNEL))
>>>>>    		flags |= XE_LRC_CREATE_USER_CTX;
>>>>> +	if (q->flags & EXEC_QUEUE_FLAG_STATE_CACHE_PERF_FIX)
>>>>> +		flags |= XE_LRC_STATE_CACHE_PERF_FIX;
>>>>> +
>>>>>    	err = q->ops->init(q);
>>>>>    	if (err)
>>>>>    		return err;
>>>>> @@ -850,6 +853,17 @@ static int exec_queue_set_multi_queue_priority(struct xe_device *xe, struct xe_e
>>>>>    	return q->ops->set_multi_queue_priority(q, value);
>>>>>    }
>>>>> +static int exec_queue_set_state_cache_perf_fix(struct xe_device *xe, struct xe_exec_queue *q,
>>>>> +					       u64 value)
>>>>> +{
>>>>> +	if (XE_IOCTL_DBG(xe, q->class != XE_ENGINE_CLASS_RENDER))
>>>>> +		return -EOPNOTSUPP;
>>>>> +
>>>>> +	q->flags |= value != 0 ? EXEC_QUEUE_FLAG_STATE_CACHE_PERF_FIX : 0;
>>>>> +
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>>    typedef int (*xe_exec_queue_set_property_fn)(struct xe_device *xe,
>>>>>    					     struct xe_exec_queue *q,
>>>>>    					     u64 value);
>>>>> @@ -862,6 +876,7 @@ static const xe_exec_queue_set_property_fn exec_queue_set_property_funcs[] = {
>>>>>    	[DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP] = exec_queue_set_multi_group,
>>>>>    	[DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY] =
>>>>>    							exec_queue_set_multi_queue_priority,
>>>>> +	[DRM_XE_EXEC_QUEUE_SET_STATE_CACHE_PERF_FIX] = exec_queue_set_state_cache_perf_fix,
>>>>>    };
>>>>>    int xe_exec_queue_set_property_ioctl(struct drm_device *dev, void *data,
>>>>> @@ -946,7 +961,8 @@ static int exec_queue_user_ext_set_property(struct xe_device *xe,
>>>>>    			 ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_PXP_TYPE &&
>>>>>    			 ext.property != DRM_XE_EXEC_QUEUE_SET_HANG_REPLAY_STATE &&
>>>>>    			 ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP &&
>>>>> -			 ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY))
>>>>> +			 ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY &&
>>>>> +			 ext.property != DRM_XE_EXEC_QUEUE_SET_STATE_CACHE_PERF_FIX))
>>>>>    		return -EINVAL;
>>>>>    	idx = array_index_nospec(ext.property, ARRAY_SIZE(exec_queue_set_property_funcs));
>>>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>>>> index 3791fed34ffa5..f4f72d01eb8c8 100644
>>>>> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>>>>> @@ -134,6 +134,8 @@ struct xe_exec_queue {
>>>>>    #define EXEC_QUEUE_FLAG_LOW_LATENCY		BIT(5)
>>>>>    /* for migration (kernel copy, clear, bind) jobs */
>>>>>    #define EXEC_QUEUE_FLAG_MIGRATE			BIT(6)
>>>>> +/* for programming COMMON_SLICE_CHICKEN3 on first submission */
>>>>> +#define EXEC_QUEUE_FLAG_STATE_CACHE_PERF_FIX	BIT(7)
>>>>>    	/**
>>>>>    	 * @flags: flags for this exec queue, should statically setup aside from ban
>>>>> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
>>>>> index 38f648b98868d..a962ac2bb7ca2 100644
>>>>> --- a/drivers/gpu/drm/xe/xe_lrc.c
>>>>> +++ b/drivers/gpu/drm/xe/xe_lrc.c
>>>>> @@ -14,6 +14,7 @@
>>>>>    #include "instructions/xe_gfxpipe_commands.h"
>>>>>    #include "instructions/xe_gfx_state_commands.h"
>>>>>    #include "regs/xe_engine_regs.h"
>>>>> +#include "regs/xe_gt_regs.h"
>>>>>    #include "regs/xe_lrc_layout.h"
>>>>>    #include "xe_bb.h"
>>>>>    #include "xe_bo.h"
>>>>> @@ -1447,6 +1448,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>>>>>    	struct xe_device *xe = gt_to_xe(gt);
>>>>>    	struct iosys_map map;
>>>>>    	u32 arb_enable;
>>>>> +	u32 state_cache_perf_fix[3];
>>>>>    	u32 bo_flags;
>>>>>    	int err;
>>>>> @@ -1579,6 +1581,13 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>>>>>    	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
>>>>>    	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
>>>>> +	if (init_flags & XE_LRC_STATE_CACHE_PERF_FIX) {
>>>>> +		state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
>>>>> +		state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
>>>>> +		state_cache_perf_fix[2] = _MASKED_BIT_ENABLE(STATE_CACHE_PERF_FIX_DISABLED);
>>>>> +		xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
>>>>> +	}
>>>> This will put instructions in the LRC's ring to update the register.  So
>>>> when this context starts running, the context switch will load the
>>>> default value of COMMON_SLICE_CHICKEN3 from the LRC's main MI_LRI
>>>> instruction, then these commands will run to update the value, and
>>>> eventually when we context switch away, the modified value will be
>>>> written out to the LRC's main MI_LRI instruction.
>>>>
>>>> That should work, but wouldn't it be more straightforward (and more
>>>> consistent with our other LRC initialization) to use
>>>> xe_lrc_write_ctx_reg() to put the value we want into the LRC even before
>>>> it runs for the first time?  That's how we poke several other register
>>>> values into the in-memory LRC during init.  There's a
>>>> xe_lrc_read_ctx_reg() you can use to get the current value for
>>>> read-modify-write purposes (see the handling of the RUNALONE flag for an
>>>> example).
>>>>
>>>> The only quirk of using xe_lrc_read_ctx_reg() instead of
>>>> xe_lrc_write_ring() is that we'll need to add a #define for the dword
>>>> offset of COMMON_SLICE_CHICKEN3 within the LRC since we don't have that
>>>> defined yet.
>>>
>>> I'm not sure how you make this work.
>>>
>>> For the registers you currently place like this from the host, their location in
>>> the image is known and doesn't change.
>>>
>>> I can't say this is the case for COMMON_SLICE_CHICKEN3.
>> You'd find it by looking at bspec 65182, although it's a bit annoying
>> since you have to manually count up the values in the "# of DW" column
>> to find the proper offset.
> Now that I think about it, we could probably do something on the KMD
> side to make it easier to find these offsets for people who have access
> to the platform in question --- we could add a running offset
> to /sys/kernel/debug/dri/0/gt0/default_lrc_rcs and such.  I'll add that
> to my todo list, since it may come in useful in the future.
>
>
> Matt


Nice idea. I'm afraid you're going to find out it's not stable across 
GPUs and it'll be rather annoying to have an offset per platform.

Hopefully I'm wrong.


-Lionel


>
>> Anyway, it's not a big deal.  We can always switch over later on as a
>> follow-up patch if we decide we want to.
>>
>>
>> Matt
>>
>>>
>>> -Lionel
>>>
>>>
>>>>> +
>>>>>    	map = __xe_lrc_seqno_map(lrc);
>>>>>    	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
>>>>> diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
>>>>> index c307a3fd9ea28..083a2167aeef8 100644
>>>>> --- a/drivers/gpu/drm/xe/xe_lrc.h
>>>>> +++ b/drivers/gpu/drm/xe/xe_lrc.h
>>>>> @@ -49,6 +49,7 @@ struct xe_lrc_snapshot {
>>>>>    #define XE_LRC_CREATE_RUNALONE		BIT(0)
>>>>>    #define XE_LRC_CREATE_PXP		BIT(1)
>>>>>    #define XE_LRC_CREATE_USER_CTX		BIT(2)
>>>>> +#define XE_LRC_STATE_CACHE_PERF_FIX	BIT(3)
>>>>>    struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
>>>>>    			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags);
>>>>> diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
>>>>> index 34db266b723fa..5927eaf792efe 100644
>>>>> --- a/drivers/gpu/drm/xe/xe_query.c
>>>>> +++ b/drivers/gpu/drm/xe/xe_query.c
>>>>> @@ -340,6 +340,8 @@ static int query_config(struct xe_device *xe, struct drm_xe_device_query *query)
>>>>>    			DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT;
>>>>>    	config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
>>>>>    			DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY;
>>>>> +	config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
>>>>> +		DRM_XE_QUERY_CONFIG_FLAG_HAS_STATE_CACHE_PERF_FIX;
>>>>>    	config->info[DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT] =
>>>>>    		xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K;
>>>>>    	config->info[DRM_XE_QUERY_CONFIG_VA_BITS] = xe->info.va_bits;
>>>>> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
>>>>> index c9e70f78e7238..856838fcadd89 100644
>>>>> --- a/include/uapi/drm/xe_drm.h
>>>>> +++ b/include/uapi/drm/xe_drm.h
>>>>> @@ -406,6 +406,9 @@ struct drm_xe_query_mem_regions {
>>>>>     *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT - Flag is set if the
>>>>>     *      device supports the userspace hint %DRM_XE_GEM_CREATE_FLAG_NO_COMPRESSION.
>>>>>     *      This is exposed only on Xe2+.
>>>>> + *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_STATE_CACHE_PERF_FIX - Flag is set
>>>>> + *      if a queue can be created with
>>>>> + *      %DRM_XE_EXEC_QUEUE_SET_STATE_CACHE_PERF_FIX
>>>>>     *  - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment
>>>>>     *    required by this device, typically SZ_4K or SZ_64K
>>>>>     *  - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address
>>>>> @@ -425,6 +428,7 @@ struct drm_xe_query_config {
>>>>>    	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY	(1 << 1)
>>>>>    	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR	(1 << 2)
>>>>>    	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT (1 << 3)
>>>>> +	#define DRM_XE_QUERY_CONFIG_FLAG_HAS_STATE_CACHE_PERF_FIX	(1 << 4)
>>>>>    #define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT		2
>>>>>    #define DRM_XE_QUERY_CONFIG_VA_BITS			3
>>>>>    #define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY	4
>>>>> @@ -1279,6 +1283,9 @@ struct drm_xe_vm_bind {
>>>>>     *  - %DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY - Set the queue
>>>>>     *    priority within the multi-queue group. Current valid priority values are 0–2
>>>>>     *    (default is 1), with higher values indicating higher priority.
>>>>> + *  - %DRM_XE_EXEC_QUEUE_SET_STATE_CACHE_PERF_FIX - Set the queue to
>>>>> + *    enable render color cache keying on BTP+BTI instead of just BTI
>>>>> + *    (only valid for render queues).
>>>> I'm not sure if this is the best name.  The bspec indicates that
>>>> 0x7304[13] effectively *disables* "state cache perf fix" which was only
>>>> intended for DX11 scenarios and shouldn't be used elsewhere.  So it
>>>> seems like the name here should either mention "disable" or should be a
>>>> more descriptive explanation of what actually happens when we set this
>>>> flag (e.g., "xxx_USE_BTP_AND_BTI" rather than using the vague "PERF_FIX"
>>>> terminology).  The maintainers may have thoughts on what they want to
>>>> see.
>>>>
>>>>
>>>> Matt
>>>>
>>>>>     *
>>>>>     * The example below shows how to use @drm_xe_exec_queue_create to create
>>>>>     * a simple exec_queue (no parallel submission) of class
>>>>> @@ -1323,6 +1330,7 @@ struct drm_xe_exec_queue_create {
>>>>>    #define   DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP		4
>>>>>    #define     DRM_XE_MULTI_GROUP_CREATE				(1ull << 63)
>>>>>    #define   DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY	5
>>>>> +#define   DRM_XE_EXEC_QUEUE_SET_STATE_CACHE_PERF_FIX		6
>>>>>    	/** @extensions: Pointer to the first extension struct, if any */
>>>>>    	__u64 extensions;
>>>>> -- 
>>>>> 2.43.0
>>>>>
>> -- 
>> Matt Roper
>> Graphics Software Engineer
>> Linux GPU Platform Enablement
>> Intel Corporation



  reply	other threads:[~2026-03-02  7:50 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-17  8:34 [v2] drm/xe: Allow per queue programming of COMMON_SLICE_CHICKEN3 bit13 Lionel Landwerlin
2026-02-17  8:38 ` ✓ CI.KUnit: success for drm/xe: Allow per queue programming of COMMON_SLICE_CHICKEN3 bit13 (rev2) Patchwork
2026-02-17  9:24 ` ✓ Xe.CI.BAT: " Patchwork
2026-02-17 10:22 ` ✗ Xe.CI.FULL: failure " Patchwork
2026-02-17 23:51 ` [v2] drm/xe: Allow per queue programming of COMMON_SLICE_CHICKEN3 bit13 Matt Roper
2026-02-27  8:42   ` Lionel Landwerlin
2026-02-27 22:12     ` Matt Roper
2026-02-27 22:17       ` Matt Roper
2026-03-02  7:52         ` Lionel Landwerlin [this message]
2026-02-18 15:58 ` Rodrigo Vivi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=711d1ca5-25ed-4518-8282-ec52251cc094@intel.com \
    --to=lionel.g.landwerlin@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.d.roper@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox