All of lore.kernel.org
 help / color / mirror / Atom feed
From: Raag Jadav <raag.jadav@intel.com>
To: Riana Tauro <riana.tauro@intel.com>
Cc: intel-xe@lists.freedesktop.org, dri-devel@lists.freedesktop.org,
	aravind.iddamsetty@linux.intel.com, anshuman.gupta@intel.com,
	rodrigo.vivi@intel.com, joonas.lahtinen@linux.intel.com,
	simona.vetter@ffwll.ch, airlied@gmail.com, pratik.bari@intel.com,
	joshua.santosh.ranjan@intel.com, ashwin.kumar.kulkarni@intel.com,
	shubham.kumar@intel.com, ravi.kishore.koppuravuri@intel.com,
	Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Subject: Re: [PATCH v5 5/5] drm/xe/xe_hw_error: Add support for PVC SoC errors
Date: Thu, 5 Feb 2026 19:10:28 +0100	[thread overview]
Message-ID: <aYTdFPXDRUsT7JGe@black.igk.intel.com> (raw)
In-Reply-To: <20260202064356.286243-12-riana.tauro@intel.com>

On Mon, Feb 02, 2026 at 12:14:01PM +0530, Riana Tauro wrote:
> Report the SoC nonfatal/fatal hardware error and update the counters.
> 
> $ sudo ynl --family drm_ras --do query-error-counter  --json '{"node-id":0, "error-id":2}'

Same comment as last patch.

> {'error-id': 2, 'error-name': 'soc-internal', 'error-value': 0}
> 
> Co-developed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
> ---
> v2: Add ID's and names as uAPI (Rodrigo)
> 
> v3: reorder and align arrays
>     remove redundant string err
>     use REG_BIT
>     fix aesthic review comments (Raag)
>     use only correctable/uncorrectable error severity (Aravind)
> 
> v4: fix comments
>     use master as variable name
>     add static_assert (Raag)
> ---
>  drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  24 +++
>  drivers/gpu/drm/xe/xe_hw_error.c           | 221 ++++++++++++++++++++-
>  2 files changed, 244 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
> index 17982a335941..a89a07d067fc 100644
> --- a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
> @@ -41,6 +41,7 @@
>  									  DEV_ERR_STAT_NONFATAL))
>  
>  #define   XE_CSC_ERROR					17

I overlooked this in the last patch but I think this should be used as

	if (err_src & REG_BIT(XE_CSC_ERROR))

> +#define   XE_SOC_ERROR					16
>  #define   XE_GT_ERROR					0
>  
>  #define ERR_STAT_GT_FATAL_VECTOR_0			0x100260
> @@ -61,4 +62,27 @@
>  							ERR_STAT_GT_COR_VECTOR_REG(x) : \
>  							ERR_STAT_GT_FATAL_VECTOR_REG(x))
>  
> +#define SOC_PVC_MASTER_BASE				0x282000
> +#define SOC_PVC_SLAVE_BASE				0x283000
> +
> +#define SOC_GCOERRSTS					0x200
> +#define SOC_GNFERRSTS					0x210
> +#define SOC_GLOBAL_ERR_STAT_REG(base, x)		XE_REG(_PICK_EVEN((x), \
> +									  (base) + SOC_GCOERRSTS, \
> +									  (base) + SOC_GNFERRSTS))
> +#define   SOC_SLAVE_IEH					REG_BIT(1)
> +#define   SOC_IEH0_LOCAL_ERR_STATUS			REG_BIT(0)
> +#define   SOC_IEH1_LOCAL_ERR_STATUS			REG_BIT(0)
> +
> +#define SOC_GSYSEVTCTL					0x264
> +#define SOC_GSYSEVTCTL_REG(master, slave, x)		XE_REG(_PICK_EVEN((x), \
> +									  (master) + SOC_GSYSEVTCTL, \
> +									  (slave) + SOC_GSYSEVTCTL))
> +
> +#define SOC_LERRUNCSTS					0x280
> +#define SOC_LERRCORSTS					0x294
> +#define SOC_LOCAL_ERR_STAT_REG(base, hw_err)		XE_REG(hw_err == HARDWARE_ERROR_CORRECTABLE ? \
> +							       (base) + SOC_LERRCORSTS : \
> +							       (base) + SOC_LERRUNCSTS)
> +
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
> index ff31fb322c8a..159ec796386a 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.c
> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
> @@ -19,6 +19,7 @@
>  #define  GT_HW_ERROR_MAX_ERR_BITS	16
>  #define  HEC_UNCORR_FW_ERR_BITS		4
>  #define  XE_RAS_REG_SIZE		32
> +#define  XE_SOC_NUM_IEH			2
>  
>  #define  PVC_ERROR_MASK_SET(hw_err, err_bit) \
>  	((hw_err == HARDWARE_ERROR_CORRECTABLE) ? (BIT(err_bit) & PVC_COR_ERR_MASK) : \
> @@ -36,7 +37,8 @@ static const char * const hec_uncorrected_fw_errors[] = {
>  };
>  
>  static const unsigned long xe_hw_error_map[] = {
> -	[XE_GT_ERROR] = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE,
> +	[XE_GT_ERROR]	= DRM_XE_RAS_ERR_COMP_CORE_COMPUTE,

Unneeded churn, please align in the original patch.

> +	[XE_SOC_ERROR]	= DRM_XE_RAS_ERR_COMP_SOC_INTERNAL,
>  };
>  
>  enum gt_vector_regs {
> @@ -60,6 +62,102 @@ static enum drm_xe_ras_error_severity hw_err_to_severity(enum hardware_error hw_
>  	return DRM_XE_RAS_ERR_SEV_UNCORRECTABLE;
>  }
>  
> +static const char * const pvc_master_global_err_reg[] = {
> +	[0 ... 1]	= "Undefined",
> +	[2]		= "HBM SS0: Channel0",
> +	[3]		= "HBM SS0: Channel1",
> +	[4]		= "HBM SS0: Channel2",
> +	[5]		= "HBM SS0: Channel3",
> +	[6]		= "HBM SS0: Channel4",
> +	[7]		= "HBM SS0: Channel5",
> +	[8]		= "HBM SS0: Channel6",
> +	[9]		= "HBM SS0: Channel7",
> +	[10]		= "HBM SS1: Channel0",
> +	[11]		= "HBM SS1: Channel1",
> +	[12]		= "HBM SS1: Channel2",
> +	[13]		= "HBM SS1: Channel3",
> +	[14]		= "HBM SS1: Channel4",
> +	[15]		= "HBM SS1: Channel5",
> +	[16]		= "HBM SS1: Channel6",
> +	[17]		= "HBM SS1: Channel7",
> +	[18 ... 31]	= "Undefined",
> +};
> +

Redundant blank line.

> +static_assert(ARRAY_SIZE(pvc_master_global_err_reg) == XE_RAS_REG_SIZE);
> +
> +static const char * const pvc_slave_global_err_reg[] = {
> +	[0]		= "Undefined",
> +	[1]		= "HBM SS2: Channel0",
> +	[2]		= "HBM SS2: Channel1",
> +	[3]		= "HBM SS2: Channel2",
> +	[4]		= "HBM SS2: Channel3",
> +	[5]		= "HBM SS2: Channel4",
> +	[6]		= "HBM SS2: Channel5",
> +	[7]		= "HBM SS2: Channel6",
> +	[8]		= "HBM SS2: Channel7",
> +	[9]		= "HBM SS3: Channel0",
> +	[10]		= "HBM SS3: Channel1",
> +	[11]		= "HBM SS3: Channel2",
> +	[12]		= "HBM SS3: Channel3",
> +	[13]		= "HBM SS3: Channel4",
> +	[14]		= "HBM SS3: Channel5",
> +	[15]		= "HBM SS3: Channel6",
> +	[16]		= "HBM SS3: Channel7",
> +	[17]		= "Undefined",
> +	[18]		= "ANR MDFI",
> +	[19 ... 31]	= "Undefined",
> +};
> +

Ditto.

> +static_assert(ARRAY_SIZE(pvc_slave_global_err_reg) == XE_RAS_REG_SIZE);
> +
> +static const char * const pvc_slave_local_fatal_err_reg[] = {
> +	[0]		= "Local IEH: Malformed PCIe AER",
> +	[1]		= "Local IEH: Malformed PCIe ERR",
> +	[2]		= "Local IEH: UR conditions in IEH",
> +	[3]		= "Local IEH: From SERR Sources",
> +	[4 ... 19]	= "Undefined",
> +	[20]		= "Malformed MCA error packet (HBM/Punit)",
> +	[21 ... 31]	= "Undefined",
> +};
> +

Ditto.

> +static_assert(ARRAY_SIZE(pvc_slave_local_fatal_err_reg) == XE_RAS_REG_SIZE);
> +
> +static const char * const pvc_master_local_fatal_err_reg[] = {
> +	[0]		= "Local IEH: Malformed IOSF PCIe AER",
> +	[1]		= "Local IEH: Malformed IOSF PCIe ERR",
> +	[2]		= "Local IEH: UR RESPONSE",
> +	[3]		= "Local IEH: From SERR SPI controller",
> +	[4]		= "Base Die MDFI T2T",
> +	[5]		= "Undefined",
> +	[6]		= "Base Die MDFI T2C",
> +	[7]		= "Undefined",
> +	[8]		= "Invalid CSC PSF Command Parity",
> +	[9]		= "Invalid CSC PSF Unexpected Completion",
> +	[10]		= "Invalid CSC PSF Unsupported Request",
> +	[11]		= "Invalid PCIe PSF Command Parity",
> +	[12]		= "PCIe PSF Unexpected Completion",
> +	[13]		= "PCIe PSF Unsupported Request",
> +	[14 ... 19]	= "Undefined",
> +	[20]		= "Malformed MCA error packet (HBM/Punit)",
> +	[21 ... 31]	= "Undefined",
> +};
> +

Ditto.

> +static_assert(ARRAY_SIZE(pvc_master_local_fatal_err_reg) == XE_RAS_REG_SIZE);
> +
> +static const char * const pvc_master_local_nonfatal_err_reg[] = {
> +	[0 ... 3]	= "Undefined",
> +	[4]		= "Base Die MDFI T2T",
> +	[5]		= "Undefined",
> +	[6]		= "Base Die MDFI T2C",
> +	[7]		= "Undefined",
> +	[8]		= "Invalid CSC PSF Command Parity",
> +	[9]		= "Invalid CSC PSF Unexpected Completion",
> +	[10]		= "Invalid PCIe PSF Command Parity",
> +	[11 ... 31]	= "Undefined",
> +};
> +

Ditto.

> +static_assert(ARRAY_SIZE(pvc_master_local_nonfatal_err_reg) == XE_RAS_REG_SIZE);
> +
>  static bool fault_inject_csc_hw_error(void)
>  {
>  	return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
> @@ -138,6 +236,26 @@ static void log_gt_err(struct xe_tile *tile, const char *name, int i, u32 err,
>  				    name, severity_str, i, err);
>  }
>  
> +static void log_soc_error(struct xe_tile *tile, const char * const *reg_info,
> +			  const enum drm_xe_ras_error_severity severity, u32 err_bit, u32 index)
> +{
> +	const char *severity_str = error_severity[severity];
> +	struct xe_device *xe = tile_to_xe(tile);
> +	struct xe_drm_ras *ras = &xe->ras;
> +	struct xe_drm_ras_counter *info = ras->info[severity];
> +	const char *name;
> +
> +	name = reg_info[err_bit];
> +
> +	if (strcmp(name, "Undefined")) {
> +		if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
> +			drm_warn(&xe->drm, "%s SOC %s detected", name, severity_str);
> +		else
> +			drm_err_ratelimited(&xe->drm, "%s SOC %s detected", name, severity_str);
> +		atomic_inc(&info[index].counter);
> +	}
> +}
> +
>  static void gt_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err,
>  				u32 error_id)
>  {
> @@ -221,6 +339,104 @@ static void gt_hw_error_handler(struct xe_tile *tile, const enum hardware_error
>  	}
>  }
>  
> +static void soc_slave_ieh_handler(struct xe_tile *tile, const enum hardware_error hw_err, u32 error_id)
> +{
> +	const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
> +	unsigned long slave_global_errstat, slave_local_errstat;
> +	struct xe_mmio *mmio = &tile->mmio;
> +	u32 regbit, slave_base;
> +
> +	slave_base = SOC_PVC_SLAVE_BASE;

Just name it 'slave' and it'll probably help remove the line wrapping below.

> +	slave_global_errstat = xe_mmio_read32(mmio, SOC_GLOBAL_ERR_STAT_REG(slave_base, hw_err));
> +
> +	if (slave_global_errstat & SOC_IEH1_LOCAL_ERR_STATUS) {
> +		slave_local_errstat = xe_mmio_read32(mmio, SOC_LOCAL_ERR_STAT_REG(slave_base, hw_err));
> +
> +		if (hw_err == HARDWARE_ERROR_FATAL) {
> +			for_each_set_bit(regbit, &slave_local_errstat, XE_RAS_REG_SIZE)
> +				log_soc_error(tile, pvc_slave_local_fatal_err_reg, severity,
> +					      regbit, error_id);
> +		}
> +
> +		xe_mmio_write32(mmio, SOC_LOCAL_ERR_STAT_REG(slave_base, hw_err),
> +				slave_local_errstat);
> +	}
> +
> +	for_each_set_bit(regbit, &slave_global_errstat, XE_RAS_REG_SIZE)
> +		log_soc_error(tile, pvc_slave_global_err_reg, severity, regbit, error_id);
> +
> +	xe_mmio_write32(mmio, SOC_GLOBAL_ERR_STAT_REG(slave_base, hw_err), slave_global_errstat);
> +}
> +
> +static void soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err,
> +				 u32 error_id)
> +{
> +	const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
> +	struct xe_device *xe = tile_to_xe(tile);
> +	struct xe_mmio *mmio = &tile->mmio;
> +	unsigned long master_global_errstat, master_local_errstat;
> +	u32 master_base, slave_base, regbit;
> +	int i;
> +
> +	if (xe->info.platform != XE_PVC)
> +		return;
> +
> +	master_base = SOC_PVC_MASTER_BASE;
> +	slave_base = SOC_PVC_SLAVE_BASE;

Ditto. Just 'master' and 'slave' will help remove the line wrapping below.

> +	/* Mask error type in GSYSEVTCTL so that no new errors of the type will be reported */
> +	for (i = 0; i < XE_SOC_NUM_IEH; i++)
> +		xe_mmio_write32(mmio, SOC_GSYSEVTCTL_REG(master_base, slave_base, i),
> +				~REG_BIT(hw_err));
> +
> +	if (hw_err == HARDWARE_ERROR_CORRECTABLE) {
> +		xe_mmio_write32(mmio, SOC_GLOBAL_ERR_STAT_REG(master_base, hw_err),
> +				REG_GENMASK(31, 0));
> +		xe_mmio_write32(mmio, SOC_LOCAL_ERR_STAT_REG(master_base, hw_err),
> +				REG_GENMASK(31, 0));
> +		xe_mmio_write32(mmio, SOC_GLOBAL_ERR_STAT_REG(slave_base, hw_err),
> +				REG_GENMASK(31, 0));
> +		xe_mmio_write32(mmio, SOC_LOCAL_ERR_STAT_REG(slave_base, hw_err),
> +				REG_GENMASK(31, 0));
> +		goto unmask_gsysevtctl;
> +	}
> +
> +	/*
> +	 * Read the master global IEH error register if BIT(1) is set then process

Missing comma after 'register'.

> +	 * the slave IEH first. If BIT(0) in global error register is set then process
> +	 * the corresponding local error registers.
> +	 */
> +	master_global_errstat = xe_mmio_read32(mmio, SOC_GLOBAL_ERR_STAT_REG(master_base, hw_err));
> +	if (master_global_errstat & SOC_SLAVE_IEH)
> +		soc_slave_ieh_handler(tile, hw_err, error_id);
> +
> +	if (master_global_errstat & SOC_IEH0_LOCAL_ERR_STATUS) {
> +		master_local_errstat = xe_mmio_read32(mmio, SOC_LOCAL_ERR_STAT_REG(master_base, hw_err));
> +
> +		for_each_set_bit(regbit, &master_local_errstat, XE_RAS_REG_SIZE) {
> +			const char * const *reg_info = (hw_err == HARDWARE_ERROR_FATAL) ?

This looks like it can be outside the loop.

Raag

> +						       pvc_master_local_fatal_err_reg :
> +						       pvc_master_local_nonfatal_err_reg;
> +
> +			log_soc_error(tile, reg_info, severity, regbit, error_id);
> +		}
> +
> +		xe_mmio_write32(mmio, SOC_LOCAL_ERR_STAT_REG(master_base, hw_err),
> +				master_local_errstat);
> +	}
> +
> +	for_each_set_bit(regbit, &master_global_errstat, XE_RAS_REG_SIZE)
> +		log_soc_error(tile, pvc_master_global_err_reg, severity, regbit, error_id);
> +
> +	xe_mmio_write32(mmio, SOC_GLOBAL_ERR_STAT_REG(master_base, hw_err),
> +			master_global_errstat);
> +
> +unmask_gsysevtctl:
> +	for (i = 0; i < XE_SOC_NUM_IEH; i++)
> +		xe_mmio_write32(mmio, SOC_GSYSEVTCTL_REG(master_base, slave_base, i),
> +				(HARDWARE_ERROR_MAX << 1) + 1);
> +}
> +
>  static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>  {
>  	const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
> @@ -283,8 +499,11 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er
>  					    "TILE%d reported %s %s, bit[%d] is set\n",
>  					    tile->id, name, severity_str, err_bit);
>  		}
> +
>  		if (err_bit == XE_GT_ERROR)
>  			gt_hw_error_handler(tile, hw_err, error_id);
> +		if (err_bit == XE_SOC_ERROR)
> +			soc_hw_error_handler(tile, hw_err, error_id);
>  	}
>  
>  clear_reg:
> -- 
> 2.47.1
> 

  reply	other threads:[~2026-02-05 18:10 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-02  6:43 [PATCH v5 0/5] Introduce DRM_RAS using generic netlink for RAS Riana Tauro
2026-02-02  6:43 ` [PATCH v5 1/5] drm/ras: Introduce the DRM RAS infrastructure over generic netlink Riana Tauro
2026-02-02 10:08   ` kernel test robot
2026-02-02 22:52   ` kernel test robot
2026-02-02  6:43 ` [PATCH v5 2/5] drm/xe/xe_drm_ras: Add support for XE DRM RAS Riana Tauro
2026-02-03 17:58   ` Raag Jadav
2026-02-10  4:20     ` Riana Tauro
2026-02-02  6:43 ` [PATCH v5 3/5] drm/xe/xe_hw_error: Integrate DRM RAS with hardware error handling Riana Tauro
2026-02-05  8:30   ` Raag Jadav
2026-02-10  4:58     ` Riana Tauro
2026-02-10  4:59       ` Riana Tauro
2026-02-02  6:44 ` [PATCH v5 4/5] drm/xe/xe_hw_error: Add support for Core-Compute errors Riana Tauro
2026-02-05 15:30   ` Raag Jadav
2026-02-10  5:58     ` Riana Tauro
2026-02-10 11:45       ` Raag Jadav
2026-02-12  3:25         ` Riana Tauro
2026-02-02  6:44 ` [PATCH v5 5/5] drm/xe/xe_hw_error: Add support for PVC SoC errors Riana Tauro
2026-02-05 18:10   ` Raag Jadav [this message]
2026-02-10  6:32     ` Riana Tauro
2026-02-10 11:52       ` Raag Jadav
2026-02-02 16:15 ` ✗ CI.checkpatch: warning for Introduce DRM_RAS using generic netlink for RAS (rev5) Patchwork
2026-02-02 16:16 ` ✓ CI.KUnit: success " Patchwork
2026-02-02 16:31 ` ✗ CI.checksparse: warning " Patchwork
2026-02-02 16:51 ` ✓ Xe.CI.BAT: success " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aYTdFPXDRUsT7JGe@black.igk.intel.com \
    --to=raag.jadav@intel.com \
    --cc=airlied@gmail.com \
    --cc=anshuman.gupta@intel.com \
    --cc=aravind.iddamsetty@linux.intel.com \
    --cc=ashwin.kumar.kulkarni@intel.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=himal.prasad.ghimiray@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=joonas.lahtinen@linux.intel.com \
    --cc=joshua.santosh.ranjan@intel.com \
    --cc=pratik.bari@intel.com \
    --cc=ravi.kishore.koppuravuri@intel.com \
    --cc=riana.tauro@intel.com \
    --cc=rodrigo.vivi@intel.com \
    --cc=shubham.kumar@intel.com \
    --cc=simona.vetter@ffwll.ch \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.