[PATCH v3 3/4] drm/xe/xe_hw_error: Add support for GT hardware errors

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: Riana Tauro <riana.tauro@intel.com>
To: intel-xe@lists.freedesktop.org, dri-devel@lists.freedesktop.org
Cc: aravind.iddamsetty@linux.intel.com, anshuman.gupta@intel.com,
	rodrigo.vivi@intel.com, joonas.lahtinen@linux.intel.com,
	lukas@wunner.de, simona.vetter@ffwll.ch, airlied@gmail.com,
	pratik.bari@intel.com, joshua.santosh.ranjan@intel.com,
	ashwin.kumar.kulkarni@intel.com, shubham.kumar@intel.com,
	Riana Tauro <riana.tauro@intel.com>,
	Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Subject: [PATCH v3 3/4] drm/xe/xe_hw_error: Add support for GT hardware errors
Date: Fri,  5 Dec 2025 14:09:35 +0530	[thread overview]
Message-ID: <20251205083934.3602030-9-riana.tauro@intel.com> (raw)
In-Reply-To: <20251205083934.3602030-6-riana.tauro@intel.com>

PVC supports GT error reporting via vector registers along with
the error status register. Add support to report these errors and
update the respective counters. In case of a Subslice error reported
by a vector register, process the error status register
for applicable bits.

Incorporate the counters inside the driver itself and start
using the drm_ras generic netlink to report them.

Co-developed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
---
v2: Add IDs and names as uAPI (Rodrigo)
---
 drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  44 +++++
 drivers/gpu/drm/xe/xe_hw_error.c           | 182 ++++++++++++++++++++-
 2 files changed, 221 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
index c146b9ef44eb..b54712e893d5 100644
--- a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
@@ -11,10 +11,54 @@
 
 #define HEC_UNCORR_FW_ERR_DW0(base)                    XE_REG((base) + 0x124)
 
+#define ERR_STAT_GT_COR				0x100160
+#define ERR_STAT_GT_NONFATAL			0x100164
+#define ERR_STAT_GT_FATAL			0x100168
+#define ERR_STAT_GT_REG(x)			XE_REG(_PICK_EVEN((x), \
+								 ERR_STAT_GT_COR, \
+								 ERR_STAT_GT_NONFATAL))
+
+#define  GT_HW_ERROR_MAX_ERR_BITS		16
+#define  EU_GRF_ERR				(15)
+#define  EU_IC_ERR				(14)
+#define  SLM_ERR				(13)
+#define  GUC_COR_ERR				(1)
+
+#define  GUC_FAT_ERR				(6)
+#define  FPU_FAT_ERR				(3)
+
+#define PVC_COR_ERR_MASK			(BIT(GUC_COR_ERR) | BIT(SLM_ERR) | \
+						 BIT(EU_IC_ERR) | BIT(EU_GRF_ERR))
+
+#define PVC_FAT_ERR_MASK			(BIT(FPU_FAT_ERR) | BIT(GUC_FAT_ERR) | \
+						 BIT(EU_GRF_ERR) | BIT(SLM_ERR))
+
 #define DEV_ERR_STAT_NONFATAL			0x100178
 #define DEV_ERR_STAT_CORRECTABLE		0x10017c
 #define DEV_ERR_STAT_REG(x)			XE_REG(_PICK_EVEN((x), \
 								  DEV_ERR_STAT_CORRECTABLE, \
 								  DEV_ERR_STAT_NONFATAL))
+
 #define   XE_CSC_ERROR				BIT(17)
+#define   XE_GT_ERROR				BIT(0)
+
+#define  ERR_STAT_GT_FATAL_VECTOR_0		0x100260
+#define  ERR_STAT_GT_FATAL_VECTOR_1		0x100264
+
+#define  ERR_STAT_GT_FATAL_VECTOR_REG(x)	XE_REG(_PICK_EVEN((x), \
+								  ERR_STAT_GT_FATAL_VECTOR_0, \
+								  ERR_STAT_GT_FATAL_VECTOR_1))
+
+#define  ERR_STAT_GT_COR_VECTOR_LEN		(4)
+#define  ERR_STAT_GT_COR_VECTOR_0		0x1002a0
+#define  ERR_STAT_GT_COR_VECTOR_1		0x1002a4
+
+#define  ERR_STAT_GT_COR_VECTOR_REG(x)		XE_REG(_PICK_EVEN((x), \
+								 ERR_STAT_GT_COR_VECTOR_0,\
+								 ERR_STAT_GT_COR_VECTOR_1))
+
+#define ERR_STAT_GT_VECTOR_REG(hw_err, x)	(hw_err == DRM_XE_RAS_ERROR_CORRECTABLE ? \
+						 ERR_STAT_GT_COR_VECTOR_REG(x) : \
+						 ERR_STAT_GT_FATAL_VECTOR_REG(x))
+
 #endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index d63078d00b56..77c90f1b06fd 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -3,6 +3,7 @@
  * Copyright © 2025 Intel Corporation
  */
 
+#include <linux/bitmap.h>
 #include <linux/fault-inject.h>
 
 #include "regs/xe_gsc_regs.h"
@@ -16,6 +17,8 @@
 #include "xe_survivability_mode.h"
 
 #define  HEC_UNCORR_FW_ERR_BITS 4
+#define XE_RAS_REG_SIZE 32
+
 extern struct fault_attr inject_csc_hw_error;
 static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;
 
@@ -26,6 +29,25 @@ static const char * const hec_uncorrected_fw_errors[] = {
 	"Data Corruption"
 };
 
+#define ERR_INDEX(_bit, index) \
+	[__ffs(_bit)] = index
+
+static const unsigned long xe_hw_error_map[] = {
+	ERR_INDEX(XE_GT_ERROR, DRM_XE_RAS_ERROR_CORE_COMPUTE),
+};
+
+enum gt_vector_regs {
+	ERR_STAT_GT_VECTOR0 = 0,
+	ERR_STAT_GT_VECTOR1,
+	ERR_STAT_GT_VECTOR2,
+	ERR_STAT_GT_VECTOR3,
+	ERR_STAT_GT_VECTOR4,
+	ERR_STAT_GT_VECTOR5,
+	ERR_STAT_GT_VECTOR6,
+	ERR_STAT_GT_VECTOR7,
+	ERR_STAT_GT_VECTOR_MAX,
+};
+
 static bool fault_inject_csc_hw_error(void)
 {
 	return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
@@ -78,14 +100,136 @@ static void csc_hw_error_handler(struct xe_tile *tile,
 	xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src);
 }
 
+static void log_hw_error(struct xe_tile *tile, const char *name,
+			 const enum drm_xe_ras_error_severity severity)
+{
+	const char *severity_str = error_severity[severity];
+	struct xe_device *xe = tile_to_xe(tile);
+
+	if (severity == DRM_XE_RAS_ERROR_FATAL)
+		drm_err_ratelimited(&xe->drm, "%s %s error detected\n", name, severity_str);
+	else
+		drm_warn(&xe->drm, "%s %s error detected\n", name, severity_str);
+}
+
+static void
+log_gt_err(struct xe_tile *tile, const char *name, int i, u32 err,
+	   const enum drm_xe_ras_error_severity severity)
+{
+	const char *severity_str = error_severity[severity];
+	struct xe_device *xe = tile_to_xe(tile);
+
+	if (severity == DRM_XE_RAS_ERROR_FATAL)
+		drm_err_ratelimited(&xe->drm, "%s %s error detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
+				    name, severity_str, i, err);
+	else
+		drm_warn(&xe->drm, "%s %s error detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
+			 name, severity_str, i, err);
+}
+
+static void gt_handle_errors(struct xe_tile *tile,
+			     const enum drm_xe_ras_error_severity severity, u32 error_id)
+{
+	struct xe_device *xe = tile_to_xe(tile);
+	struct xe_drm_ras *ras = &xe->ras;
+	struct xe_drm_ras_counter *info = ras->info[severity];
+	struct xe_mmio *mmio = &tile->mmio;
+	unsigned long err_stat = 0;
+	int i;
+
+	if (xe->info.platform != XE_PVC)
+		return;
+
+	for (i = 0; i < ERR_STAT_GT_VECTOR_MAX; i++) {
+		u32 vector, val;
+
+		if (severity == DRM_XE_RAS_ERROR_CORRECTABLE && i >= ERR_STAT_GT_COR_VECTOR_LEN)
+			break;
+
+		vector = xe_mmio_read32(mmio, ERR_STAT_GT_VECTOR_REG(severity, i));
+		if (!vector)
+			continue;
+
+		switch (i) {
+		case ERR_STAT_GT_VECTOR0:
+		case ERR_STAT_GT_VECTOR1:
+			u32 errbit;
+
+			val = hweight32(vector);
+			atomic64_add(val, &info[error_id].counter);
+			log_gt_err(tile, "Subslice", i, vector, severity);
+
+			if (err_stat)
+				break;
+
+			err_stat = xe_mmio_read32(mmio, ERR_STAT_GT_REG(severity));
+			for_each_set_bit(errbit, &err_stat, GT_HW_ERROR_MAX_ERR_BITS) {
+				if (severity == DRM_XE_RAS_ERROR_CORRECTABLE &&
+				    (BIT(errbit) & PVC_COR_ERR_MASK))
+					atomic64_inc(&info[error_id].counter);
+				if (severity == DRM_XE_RAS_ERROR_FATAL &&
+				    (BIT(errbit) & PVC_FAT_ERR_MASK))
+					atomic64_inc(&info[error_id].counter);
+			}
+			if (err_stat)
+				xe_mmio_write32(mmio, ERR_STAT_GT_REG(severity), err_stat);
+			break;
+		case ERR_STAT_GT_VECTOR2:
+		case ERR_STAT_GT_VECTOR3:
+			val = hweight32(vector);
+			atomic64_add(val, &info[error_id].counter);
+			log_gt_err(tile, "L3 BANK", i, vector, severity);
+			break;
+		case ERR_STAT_GT_VECTOR6:
+			val = hweight32(vector);
+			atomic64_add(val, &info[error_id].counter);
+			log_gt_err(tile, "TLB", i, vector, severity);
+			break;
+		case ERR_STAT_GT_VECTOR7:
+			val = hweight32(vector);
+			atomic64_add(val, &info[error_id].counter);
+			break;
+		default:
+			log_gt_err(tile, "Undefined", i, vector, severity);
+		}
+
+		xe_mmio_write32(mmio, ERR_STAT_GT_VECTOR_REG(severity, i), vector);
+	}
+}
+
+static void gt_hw_error_handler(struct xe_tile *tile,
+				const enum drm_xe_ras_error_severity severity, u32 error_id)
+{
+	struct xe_device *xe = tile_to_xe(tile);
+	struct xe_drm_ras *ras = &xe->ras;
+	struct xe_drm_ras_counter *info = ras->info[severity];
+
+	switch (severity) {
+	case DRM_XE_RAS_ERROR_CORRECTABLE:
+		gt_handle_errors(tile, severity, error_id);
+		break;
+	case DRM_XE_RAS_ERROR_NONFATAL:
+		atomic64_inc(&info[error_id].counter);
+		log_hw_error(tile, "GT", severity);
+		break;
+	case DRM_XE_RAS_ERROR_FATAL:
+		gt_handle_errors(tile, severity, error_id);
+		break;
+	default:
+		drm_warn(&xe->drm, "Undefined error detected\n");
+	}
+}
+
 static void hw_error_source_handler(struct xe_tile *tile, enum drm_xe_ras_error_severity severity)
 {
 	const char *severity_str = error_severity[severity];
 	struct xe_device *xe = tile_to_xe(tile);
-	unsigned long flags;
-	u32 err_src;
+	struct xe_drm_ras *ras = &xe->ras;
+	struct xe_drm_ras_counter *info = ras->info[severity];
+	unsigned long flags, err_src;
+	u32 err_bit;
 
-	if (xe->info.platform != XE_BATTLEMAGE)
+	if (!IS_DGFX(xe))
 		return;
 
 	spin_lock_irqsave(&xe->irq.lock, flags);
@@ -96,11 +240,39 @@ static void hw_error_source_handler(struct xe_tile *tile, enum drm_xe_ras_error_
 		goto unlock;
 	}
 
-	if (err_src & XE_CSC_ERROR)
+	if (err_src & XE_CSC_ERROR) {
 		csc_hw_error_handler(tile, severity);
+		goto clear_reg;
+	}
 
-	xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(severity), err_src);
+	if (!info) {
+		drm_err_ratelimited(&xe->drm, HW_ERR "Errors undefined\n");
+		goto clear_reg;
+	}
+
+	for_each_set_bit(err_bit, &err_src, XE_RAS_REG_SIZE) {
+		u32 error_id = xe_hw_error_map[err_bit];
+		const char *name;
+
+		name = info[error_id].name;
+		if (!name)
+			goto clear_reg;
 
+		if (severity == DRM_XE_RAS_ERROR_FATAL) {
+			drm_err_ratelimited(&xe->drm, HW_ERR
+					    "TILE%d reported %s %s error, bit[%d] is set\n",
+					    tile->id, name, severity_str, err_bit);
+		} else {
+			drm_warn(&xe->drm, HW_ERR
+				 "TILE%d reported %s %s error, bit[%d] is set\n",
+				 tile->id, name, severity_str, err_bit);
+		}
+		if (BIT(err_bit) & XE_GT_ERROR)
+			gt_hw_error_handler(tile, severity, error_id);
+	}
+
+clear_reg:
+	xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(severity), err_src);
 unlock:
 	spin_unlock_irqrestore(&xe->irq.lock, flags);
 }
-- 
2.47.1

next prev parent reply	other threads:[~2025-12-05  8:11 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-05  8:39 [PATCH v3 0/4] Introduce DRM_RAS using generic netlink for RAS Riana Tauro
2025-12-05  8:39 ` [PATCH v3 1/4] drm/ras: Introduce the DRM RAS infrastructure over generic netlink Riana Tauro
2025-12-09 21:35   ` Rodrigo Vivi
2026-01-08 22:36     ` Zack McKevitt
2026-01-09 20:57       ` Rodrigo Vivi
2026-01-13  8:20         ` Riana Tauro
2026-01-15 23:39           ` Zack McKevitt
2026-01-16  5:56             ` Riana Tauro
2026-01-16 20:26               ` Rodrigo Vivi
2025-12-05  8:39 ` [PATCH v3 2/4] drm/xe/xe_drm_ras: Add support for drm ras Riana Tauro
2025-12-09  8:22   ` Raag Jadav
2026-01-09  8:08     ` Riana Tauro
2026-01-09 14:13       ` Rodrigo Vivi
2026-01-09 15:58         ` Raag Jadav
2026-01-12  6:13           ` Riana Tauro
2026-01-12 10:27             ` Raag Jadav
2025-12-09 21:57   ` Rodrigo Vivi
2026-01-07  9:48     ` Aravind Iddamsetty
2025-12-05  8:39 ` Riana Tauro [this message]
2025-12-10 18:18   ` [PATCH v3 3/4] drm/xe/xe_hw_error: Add support for GT hardware errors Raag Jadav
2026-01-12  3:41     ` Riana Tauro
2026-01-12 10:02       ` Raag Jadav
2025-12-05  8:39 ` [PATCH v3 4/4] drm/xe/xe_hw_error: Add support for PVC SOC errors Riana Tauro
2025-12-15 10:52   ` Raag Jadav
2026-01-12  4:45     ` Riana Tauro
2026-01-12 10:06       ` Raag Jadav
2025-12-05  9:40 ` ✗ CI.checkpatch: warning for Introduce DRM_RAS using generic netlink for RAS (rev3) Patchwork
2025-12-05  9:41 ` ✓ CI.KUnit: success " Patchwork
2025-12-05  9:56 ` ✗ CI.checksparse: warning " Patchwork
2025-12-05 11:27 ` ✗ Xe.CI.Full: failure " Patchwork
2025-12-09 21:56 ` [PATCH v3 0/4] Introduce DRM_RAS using generic netlink for RAS Alex Deucher

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c146b9ef44e dfblob:b54712e893d dfblob:d63078d00b5
dfblob:77c90f1b06f )
 OR (
bs:"[PATCH v3 3/4] drm/xe/xe_hw_error: Add support for GT hardware errors" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251205083934.3602030-9-riana.tauro@intel.com \
    --to=riana.tauro@intel.com \
    --cc=airlied@gmail.com \
    --cc=anshuman.gupta@intel.com \
    --cc=aravind.iddamsetty@linux.intel.com \
    --cc=ashwin.kumar.kulkarni@intel.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=himal.prasad.ghimiray@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=joonas.lahtinen@linux.intel.com \
    --cc=joshua.santosh.ranjan@intel.com \
    --cc=lukas@wunner.de \
    --cc=pratik.bari@intel.com \
    --cc=rodrigo.vivi@intel.com \
    --cc=shubham.kumar@intel.com \
    --cc=simona.vetter@ffwll.ch \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox