public inbox for intel-xe@lists.freedesktop.org
 help / color / mirror / Atom feed
From: Gustavo Sousa <gustavo.sousa@intel.com>
To: Matt Roper <matthew.d.roper@intel.com>
Cc: <intel-xe@lists.freedesktop.org>
Subject: Re: [PATCH 12/12] drm/xe/reg_sr: Do sanity check for MCR vs non-MCR
Date: Tue, 5 May 2026 16:39:10 -0300	[thread overview]
Message-ID: <87ik91y7c1.fsf@intel.com> (raw)
In-Reply-To: <20260128235940.GW458797@mdroper-desk1.amr.corp.intel.com>

Matt Roper <matthew.d.roper@intel.com> writes:

> On Fri, Jan 16, 2026 at 07:12:20PM -0300, Gustavo Sousa wrote:
>> The type struct xe_reg_mcr exists to ensure that the correct API is used
>> when handling MCR registers.  However, for in the register save/restore
>> functionality, the RTP processing always cast the register to a struct
>> xe_reg and then apply_one_mmio() selects the MMIO API based on the "mcr"
>> field of the register instance.
>> 
>> This allows the developer to commit mistakes like passing a MCR register
>> for an RTP action for a platform where the respective register is not
>> MCR; and vice-versa.
>> 
>> To capture such scenarios, do a sanity check in xe_reg_sr_add() and
>> raise warnings if inconsistencies are detected.
>> 
>> Signed-off-by: Gustavo Sousa <gustavo.sousa@intel.com>
>> ---
>>  drivers/gpu/drm/xe/tests/xe_rtp_test.c | 71 ++++++++++++++++++++++++++++++----
>>  drivers/gpu/drm/xe/tests/xe_wa_test.c  | 11 +++++-
>>  drivers/gpu/drm/xe/xe_gt_mcr.c         | 21 ++++++++++
>>  drivers/gpu/drm/xe/xe_gt_mcr.h         |  1 +
>>  drivers/gpu/drm/xe/xe_reg_sr.c         | 36 +++++++++++++++++
>>  5 files changed, 132 insertions(+), 8 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/xe/tests/xe_rtp_test.c b/drivers/gpu/drm/xe/tests/xe_rtp_test.c
>> index d2255a59e58f..80bd83f56d04 100644
>> --- a/drivers/gpu/drm/xe/tests/xe_rtp_test.c
>> +++ b/drivers/gpu/drm/xe/tests/xe_rtp_test.c
>> @@ -9,24 +9,30 @@
>>  #include <drm/drm_drv.h>
>>  #include <drm/drm_kunit_helpers.h>
>>  
>> +#include <kunit/static_stub.h>
>>  #include <kunit/test.h>
>>  
>>  #include "regs/xe_gt_regs.h"
>>  #include "regs/xe_reg_defs.h"
>>  #include "xe_device.h"
>>  #include "xe_device_types.h"
>> +#include "xe_gt_mcr.h"
>>  #include "xe_kunit_helpers.h"
>>  #include "xe_pci_test.h"
>>  #include "xe_reg_sr.h"
>>  #include "xe_rtp.h"
>>  
>> -#define REGULAR_REG1	XE_REG(1)
>> -#define REGULAR_REG2	XE_REG(2)
>> -#define REGULAR_REG3	XE_REG(3)
>> -#define MCR_REG1	XE_REG_MCR(1)
>> -#define MCR_REG2	XE_REG_MCR(2)
>> -#define MCR_REG3	XE_REG_MCR(3)
>> -#define MASKED_REG1	XE_REG(1, XE_REG_OPTION_MASKED)
>> +#define REGULAR_REG1		XE_REG(1)
>> +#define REGULAR_REG2		XE_REG(2)
>> +#define REGULAR_REG3		XE_REG(3)
>> +#define REGULAR_REG4		XE_REG(4)
>> +#define BAD_REGULAR_REG5	XE_REG(5)
>> +#define MCR_REG1		XE_REG_MCR(1)
>> +#define MCR_REG2		XE_REG_MCR(2)
>> +#define MCR_REG3		XE_REG_MCR(3)
>> +#define BAD_MCR_REG4		XE_REG_MCR(4)
>> +#define MCR_REG5		XE_REG_MCR(5)
>> +#define MASKED_REG1		XE_REG(1, XE_REG_OPTION_MASKED)
>>  
>>  #undef XE_REG_MCR
>>  #define XE_REG_MCR(...)     XE_REG(__VA_ARGS__, .mcr = 1)
>> @@ -48,6 +54,23 @@ struct rtp_test_case {
>>  	const struct xe_rtp_entry *entries;
>>  };
>>  
>> +static bool fake_xe_gt_mcr_check_reg(struct xe_gt *gt, struct xe_reg reg)
>> +{
>> +	/*
>> +	 * All supported platforms in this imaginary setup will always have REG4
>> +	 * as a non-MCR register and REG5 as MCR, meaning that BAD_MCR_REG4 and
>> +	 * BAD_REGULAR_REG5 represent programming errors to be captured by our
>> +	 * tests.
>> +	 */
>> +	if (reg.raw == BAD_REGULAR_REG5.raw)
>> +		return true;
>> +
>> +	if (reg.raw == BAD_MCR_REG4.raw)
>> +		return false;
>> +
>> +	return reg.mcr;
>> +}
>> +
>>  static bool match_yes(const struct xe_device *xe, const struct xe_gt *gt,
>>  		      const struct xe_hw_engine *hwe)
>>  {
>> @@ -304,6 +327,38 @@ static const struct rtp_to_sr_test_case rtp_to_sr_cases[] = {
>>  			{}
>>  		},
>>  	},
>> +	{
>> +		.name = "bad-mcr-reg-forced-to-regular",
>> +		.expected_reg = REGULAR_REG4,
>> +		.expected_set_bits = REG_BIT(0),
>> +		.expected_clr_bits = REG_BIT(0),
>> +		.expected_active = BIT(0),
>> +		.expected_count_sr_entries = 1,
>> +		.expected_sr_errors = 1,
>> +		.entries = (const struct xe_rtp_entry_sr[]) {
>> +			{ XE_RTP_NAME("bad-mcr-regular-reg"),
>> +			  XE_RTP_RULES(FUNC(match_yes)),
>> +			  XE_RTP_ACTIONS(SET(BAD_MCR_REG4, REG_BIT(0)))
>> +			},
>> +			{}
>> +		},
>> +	},
>> +	{
>> +		.name = "bad-regular-reg-forced-to-mcr",
>> +		.expected_reg = MCR_REG5,
>> +		.expected_set_bits = REG_BIT(0),
>> +		.expected_clr_bits = REG_BIT(0),
>> +		.expected_active = BIT(0),
>> +		.expected_count_sr_entries = 1,
>> +		.expected_sr_errors = 1,
>> +		.entries = (const struct xe_rtp_entry_sr[]) {
>> +			{ XE_RTP_NAME("bad-regular-reg"),
>> +			  XE_RTP_RULES(FUNC(match_yes)),
>> +			  XE_RTP_ACTIONS(SET(BAD_REGULAR_REG5, REG_BIT(0)))
>> +			},
>> +			{}
>> +		},
>> +	},
>>  };
>>  
>>  static void xe_rtp_process_to_sr_tests(struct kunit *test)
>> @@ -522,6 +577,8 @@ static int xe_rtp_test_init(struct kunit *test)
>>  	xe->drm.dev = dev;
>>  	test->priv = xe;
>>  
>> +	kunit_activate_static_stub(test, xe_gt_mcr_check_reg, fake_xe_gt_mcr_check_reg);
>> +
>>  	return 0;
>>  }
>>  
>> diff --git a/drivers/gpu/drm/xe/tests/xe_wa_test.c b/drivers/gpu/drm/xe/tests/xe_wa_test.c
>> index 3311f05a6fc2..326278488cdb 100644
>> --- a/drivers/gpu/drm/xe/tests/xe_wa_test.c
>> +++ b/drivers/gpu/drm/xe/tests/xe_wa_test.c
>> @@ -13,6 +13,7 @@
>>  #include "tests/xe_mmio_intercept.h"
>>  #include "xe_device.h"
>>  #include "xe_gt.h"
>> +#include "xe_gt_mcr.h"
>>  #include "xe_hw_engine.h"
>>  #include "xe_kunit_helpers.h"
>>  #include "xe_pci_test.h"
>> @@ -38,8 +39,10 @@ static int xe_wa_test_init(struct kunit *test)
>>  {
>>  	const struct xe_pci_fake_data *param = test->param_value;
>>  	struct xe_pci_fake_data data = *param;
>> -	struct xe_device *xe;
>>  	struct device *dev;
>> +	struct xe_device *xe;
>> +	struct xe_gt *gt;
>> +	int id;
>>  	int ret;
>>  
>>  	dev = drm_kunit_helper_alloc_device(test);
>> @@ -52,6 +55,12 @@ static int xe_wa_test_init(struct kunit *test)
>>  	ret = xe_pci_fake_device_init(xe);
>>  	KUNIT_ASSERT_EQ(test, ret, 0);
>>  
>> +	/* Needed for sanitize_mcr(). */
>> +	for_each_gt(gt, xe, id) {
>> +		xe_gt_mcr_init_early(gt);
>> +		xe_gt_mmio_init(gt);
>> +	}
>> +
>>  	kunit_activate_static_stub(test,
>>  				   xe_mmio_intercept_read32,
>>  				   xe_wa_mmio_intercept_read32);
>> diff --git a/drivers/gpu/drm/xe/xe_gt_mcr.c b/drivers/gpu/drm/xe/xe_gt_mcr.c
>> index e7eb3c6da234..c6d9766b6cc6 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_mcr.c
>> +++ b/drivers/gpu/drm/xe/xe_gt_mcr.c
>> @@ -3,6 +3,8 @@
>>   * Copyright © 2022 Intel Corporation
>>   */
>>  
>> +#include <kunit/static_stub.h>
>> +
>>  #include "xe_gt_mcr.h"
>>  
>>  #include "regs/xe_gt_regs.h"
>> @@ -623,6 +625,25 @@ static bool reg_in_steering_type_ranges(struct xe_gt *gt,
>>  	return false;
>>  }
>>  
>> +/*
>> + * xe_gt_mcr_check_reg - check if a register is recognized by this GT as MCR
>> + * @gt: GT structure
>> + * @reg: The register to check
>> + *
>> + * Returns true if the register offset falls within one of the MMIO ranges
>> + * classified as MCR for the GT.
>> + */
>> +bool xe_gt_mcr_check_reg(struct xe_gt *gt, struct xe_reg reg)
>> +{
>> +	KUNIT_STATIC_STUB_REDIRECT(xe_gt_mcr_check_reg, gt, reg);
>> +
>> +	for (int type = 0; type <= IMPLICIT_STEERING; type++)
>> +		if (reg_in_steering_type_ranges(gt, reg, type))
>> +			return true;
>> +
>> +	return false;
>> +}
>> +
>>  /*
>>   * xe_gt_mcr_get_nonterminated_steering - find group/instance values that
>>   *    will steer a register to a non-terminated instance
>> diff --git a/drivers/gpu/drm/xe/xe_gt_mcr.h b/drivers/gpu/drm/xe/xe_gt_mcr.h
>> index 283a1c9770e2..d6b50b52b1d5 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_mcr.h
>> +++ b/drivers/gpu/drm/xe/xe_gt_mcr.h
>> @@ -26,6 +26,7 @@ void xe_gt_mcr_unicast_write(struct xe_gt *gt, struct xe_reg_mcr mcr_reg,
>>  void xe_gt_mcr_multicast_write(struct xe_gt *gt, struct xe_reg_mcr mcr_reg,
>>  			       u32 value);
>>  
>> +bool xe_gt_mcr_check_reg(struct xe_gt *gt, struct xe_reg reg);
>>  bool xe_gt_mcr_get_nonterminated_steering(struct xe_gt *gt,
>>  					  struct xe_reg_mcr reg_mcr,
>>  					  u8 *group, u8 *instance);
>> diff --git a/drivers/gpu/drm/xe/xe_reg_sr.c b/drivers/gpu/drm/xe/xe_reg_sr.c
>> index d3e13ea33123..89d35301defa 100644
>> --- a/drivers/gpu/drm/xe/xe_reg_sr.c
>> +++ b/drivers/gpu/drm/xe/xe_reg_sr.c
>> @@ -68,14 +68,49 @@ static void reg_sr_inc_error(struct xe_reg_sr *sr)
>>  #endif
>>  }
>>  
>> +static struct xe_reg sanitize_mcr(struct xe_reg_sr *sr,
>> +				  const struct xe_reg_sr_entry *e,
>> +				  struct xe_gt *gt)
>> +{
>> +	struct xe_reg reg = e->reg;
>> +	bool is_mcr;
>> +
>> +	/*
>> +	 * We need the gt structure to check MCR ranges.
>> +	 */
>> +	if (!gt)
>> +		return reg;
>> +
>> +	is_mcr = xe_gt_mcr_check_reg(gt, reg);
>> +
>> +	if (is_mcr && !reg.mcr) {
>> +		reg.mcr = 1;
>> +		xe_gt_warn(gt, "xe_reg_sr_entry using non-MCR register for address 0x%x, forcing MCR\n",
>> +			   reg.addr);
>> +		reg_sr_inc_error(sr);
>> +	}
>> +
>> +	if (!is_mcr && reg.mcr) {
>> +		reg.mcr = 0;
>> +		xe_gt_warn(gt, "xe_reg_sr_entry using MCR register for address 0x%x, forcing non-MCR\n",
>> +			   reg.addr);
>> +		reg_sr_inc_error(sr);
>> +	}
>
> This sanity check (and fixup) is going to happen not only on the
> non-live kunit test, but also on true device probes on hardware, right?
> In this case, we might want to consider downgrading these messages to
> notice level rather than warn.  The kunit results should let us know if
> we have a problem here (even before we have real hardware CI setup), but
> if a mistake somehow sneaks by, having warn (or error) level messages
> printed in dmesg will cause CI to consider the entire device probe a
> failure and will prevent any further testing from running on the
> platform until it's fixed.  That seems unnecessarily harsh since a
> mistake here shouldn't really impact most of the other CI results we
> could be receiving.

Yep. I agree. I'll downgrade it to notice level on the next iteration of
this patch.

>
>
> Actually, related to the earlier patches in this series, I wonder if we
> truly need to mock up register accesses for non-live kunits.  It seems
> like we might still wind up with some FUNC() rules that incorrectly
> disqualify RTP entries that we want to check.  I'm thinking of oddball
> rules like xe_rtp_match_gt_has_discontiguous_dss_groups that either come
> from previous parts of driver initialization, and/or have somewhat
> non-obvious relationships to register values. It seems like for the
> purposes of checking MCR vs non-MCR status, the _only_ thing that
> matters is IP version:  IP version determines which MCR tables are used
> on the platform and IP version is the only RTP rule that actually
> matters to confirming that we're using proper register definitions.
> E.g., for an RTP entry with rules:
>
>    XE_RTP_RULES(GRAPHICS_VERSION_RANGE(2001, 2004),
>                 GRAPHICS_STEP(B0, D0),
>                 FUNC(check_foo()),
>                 FUNC(check_bar()))
>    XE_RTP_ACTIONS(SET(CHKNREG, SOMEBIT)),
>
> we don't care what check_foo() and check_bar() are, or even what the
> stepping is.  Only the IP version rule is necessary to determine whether
> we should check CHKNREG's status against a specific IP version's MCR
> table and for the purposes of the kunit we can treat all other rules as
> 'true' without trying to process them.
>
> So maybe a simpler approach would be to have an alternative version of
> xe_rtp_process_to_sr() intended for use in non-live kunit tests that
> just ignores everything except the IP version rules and treats
> everything else as an automatic 'true.'

Makes sense.  Let me play around with that idea and see if I can send a
new version of this series with that solution.

--
Gustavo Sousa

>
>
> Matt
>
>> +
>> +	return reg;
>> +}
>> +
>>  int xe_reg_sr_add(struct xe_reg_sr *sr,
>>  		  const struct xe_reg_sr_entry *e,
>>  		  struct xe_gt *gt)
>>  {
>>  	unsigned long idx = e->reg.addr;
>>  	struct xe_reg_sr_entry *pentry = xa_load(&sr->xa, idx);
>> +	struct xe_reg reg;
>>  	int ret;
>>  
>> +	reg = sanitize_mcr(sr, e, gt);
>> +
>>  	if (pentry) {
>>  		if (!compatible_entries(pentry, e)) {
>>  			ret = -EINVAL;
>> @@ -96,6 +131,7 @@ int xe_reg_sr_add(struct xe_reg_sr *sr,
>>  	}
>>  
>>  	*pentry = *e;
>> +	pentry->reg = reg;
>>  	ret = xa_err(xa_store(&sr->xa, idx, pentry, GFP_KERNEL));
>>  	if (ret)
>>  		goto fail;
>> 
>> -- 
>> 2.52.0
>> 
>
> -- 
> Matt Roper
> Graphics Software Engineer
> Linux GPU Platform Enablement
> Intel Corporation

  reply	other threads:[~2026-05-05 19:39 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-16 22:12 [PATCH 00/12] Fix MCR inconsistencies in RTP tables Gustavo Sousa
2026-01-16 22:12 ` [PATCH 01/12] drm/xe: Define CACHE_MODE_1 as MCR register Gustavo Sousa
2026-01-21  0:04   ` Matt Roper
2026-01-16 22:12 ` [PATCH 02/12] drm/xe: Define and use MCR version of COMMON_SLICE_CHICKEN1 Gustavo Sousa
2026-01-21  0:06   ` Matt Roper
2026-01-16 22:12 ` [PATCH 03/12] drm/xe: Define and use MCR version of COMMON_SLICE_CHICKEN4 Gustavo Sousa
2026-01-21  0:08   ` Matt Roper
2026-01-16 22:12 ` [PATCH 04/12] drm/xe/kunit: Add xe_kunit_is_live_test() Gustavo Sousa
2026-01-16 22:59   ` Michal Wajdeczko
2026-05-05 18:32     ` Gustavo Sousa
2026-01-16 22:12 ` [PATCH 05/12] drm/xe/kunit: Abort test if MMIO operation is attempted Gustavo Sousa
2026-01-16 23:15   ` Michal Wajdeczko
2026-05-05 19:11     ` Gustavo Sousa
2026-01-16 22:12 ` [PATCH 06/12] drm/xe/kunit: Allow intercepting MMIO operations Gustavo Sousa
2026-01-16 22:12 ` [PATCH 07/12] drm/xe: Extract xe_hw_engine_setup_reg_lrc() Gustavo Sousa
2026-01-21  0:12   ` Matt Roper
2026-01-16 22:12 ` [PATCH 08/12] drm/xe: Extract xe_hw_engines_setup_runtime_mask() Gustavo Sousa
2026-01-28 18:07   ` Matt Roper
2026-01-16 22:12 ` [PATCH 09/12] drm/xe/kunit: Use KUNIT_EXPECT_EQ() in xe_wa_gt() Gustavo Sousa
2026-01-16 23:29   ` Michal Wajdeczko
2026-01-28 18:09   ` Matt Roper
2026-01-16 22:12 ` [PATCH 10/12] drm/xe/kunit: Include hw_engines in xe_wa test Gustavo Sousa
2026-01-28 21:08   ` Matt Roper
2026-01-16 22:12 ` [PATCH 11/12] drm/xe/mcr: Extract reg_in_steering_type_ranges() Gustavo Sousa
2026-01-28 21:11   ` Matt Roper
2026-01-16 22:12 ` [PATCH 12/12] drm/xe/reg_sr: Do sanity check for MCR vs non-MCR Gustavo Sousa
2026-01-28 23:59   ` Matt Roper
2026-05-05 19:39     ` Gustavo Sousa [this message]
2026-01-16 23:15 ` ✗ CI.checkpatch: warning for Fix MCR inconsistencies in RTP tables Patchwork
2026-01-16 23:16 ` ✓ CI.KUnit: success " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87ik91y7c1.fsf@intel.com \
    --to=gustavo.sousa@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.d.roper@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox