From: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
To: intel-xe@lists.freedesktop.org
Cc: riana.tauro@intel.com, rodrigo.vivi@intel.com,
	himal.prasad.ghimiray@intel.com, anshuman.gupta@intel.com
Subject: [PATCH 07/10] drm/xe: Support SOC NONFATAL error handling for PVC.
Date: Wed, 30 Jul 2025 11:18:11 +0530
Message-ID: <20250730054814.1376770-8-aravind.iddamsetty@linux.intel.com>
In-Reply-To: <20250730054814.1376770-1-aravind.iddamsetty@linux.intel.com>

From: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>

Report SOC nonfatal hardware errors and update the counters, which are
incremented whenever such an error occurs.
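
For reference, the nonfatal path mirrors the existing fatal handler: read
the SOC global error status register and, for every bit that is set, look
up the name/counter index in the tables added below, log the error and
bump the matching counter. A minimal sketch of that loop (the
err_name_index_pair field names and the xe_update_hw_error_cnt() helper
are placeholders for illustration, not the exact driver API):

/*
 * Illustrative only: report every asserted bit of a 32-bit SOC error
 * status value using a lookup table such as
 * pvc_soc_mstr_glbl_err_reg_nonfatal[], then bump the matching counter.
 */
static void soc_report_nonfatal_errors(struct xe_tile *tile, u32 errsrc,
				       const struct err_name_index_pair *table)
{
	unsigned long bits = errsrc;
	unsigned int bit;

	for_each_set_bit(bit, &bits, 32) {
		drm_err_ratelimited(&tile_to_xe(tile)->drm,
				    "Tile%d reported SOC NONFATAL %s error\n",
				    tile->id, table[bit].name);
		xe_update_hw_error_cnt(tile, table[bit].index);
	}
}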

v2:
- Use xe_assign_hw_err_regs to initialize the registers.
- Don't use the counters if the error is reported by second-level
  registers (see the sketch after this list).
- Fix the number of IEHs to 2.
- Follow the source_typeoferror_errorname convention for the enums and
  error reporting. (Aravind)
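
To make the second point concrete, here is a minimal sketch of that
check, assuming the routing bits keep XE_HW_ERR_TILE_UNSPEC as their
index the way pvc_soc_mstr_glbl_err_reg_nonfatal[] does below; the
helper name is hypothetical:

/*
 * Routing bits ("MASTER LOCAL Reported", "SLAVE GLOBAL Reported") only
 * indicate that another register holds the actual error, so they are
 * decoded further via the local/slave registers instead of bumping a
 * counter of their own.
 */
static bool soc_nonfatal_bit_has_counter(const struct err_name_index_pair *entry)
{
	return entry->index != XE_HW_ERR_TILE_UNSPEC;
}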

Cc: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
Reviewed-by: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
---
 drivers/gpu/drm/xe/xe_hw_error.c | 70 +++++++++++++++++++++++++++++++-
 drivers/gpu/drm/xe/xe_hw_error.h | 39 ++++++++++++++++++
 2 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 927bf2ab401f..705a670f01fc 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -260,6 +260,67 @@ static const struct err_name_index_pair pvc_soc_mstr_lcl_err_reg_fatal[] = {
 	[14 ... 31] = {"Undefined",				XE_HW_ERR_SOC_FATAL_UNKNOWN},
 };
 
+static const struct err_name_index_pair pvc_soc_mstr_glbl_err_reg_nonfatal[] = {
+	[0]         = {"MASTER LOCAL Reported",			XE_HW_ERR_TILE_UNSPEC},
+	[1]         = {"SLAVE GLOBAL Reported",			XE_HW_ERR_TILE_UNSPEC},
+	[2]         = {"HBM SS0: Channel0",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL0},
+	[3]         = {"HBM SS0: Channel1",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL1},
+	[4]         = {"HBM SS0: Channel2",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL2},
+	[5]         = {"HBM SS0: Channel3",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL3},
+	[6]         = {"HBM SS0: Channel4",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL4},
+	[7]         = {"HBM SS0: Channel5",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL5},
+	[8]         = {"HBM SS0: Channel6",                     XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL6},
+	[9]         = {"HBM SS0: Channel7",                     XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL7},
+	[10]        = {"HBM SS1: Channel0",                     XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL0},
+	[11]        = {"HBM SS1: Channel1",                     XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL1},
+	[12]        = {"HBM SS1: Channel2",                     XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL2},
+	[13]        = {"HBM SS1: Channel3",                     XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL3},
+	[14]        = {"HBM SS1: Channel4",                     XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL4},
+	[15]        = {"HBM SS1: Channel5",                     XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL5},
+	[16]        = {"HBM SS1: Channel6",                     XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL6},
+	[17]        = {"HBM SS1: Channel7",                     XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL7},
+	[18 ... 31] = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+};
+
+static const struct err_name_index_pair pvc_soc_slave_glbl_err_reg_nonfatal[] = {
+	[0]         = {"SLAVE LOCAL Reported",			XE_HW_ERR_TILE_UNSPEC},
+	[1]         = {"HBM SS2: Channel0",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL0},
+	[2]         = {"HBM SS2: Channel1",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL1},
+	[3]         = {"HBM SS2: Channel2",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL2},
+	[4]         = {"HBM SS2: Channel3",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL3},
+	[5]         = {"HBM SS2: Channel4",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL4},
+	[6]         = {"HBM SS2: Channel5",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL5},
+	[7]         = {"HBM SS2: Channel6",                     XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL6},
+	[8]         = {"HBM SS2: Channel7",                     XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL7},
+	[9]         = {"HBM SS3: Channel0",                     XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL0},
+	[10]        = {"HBM SS3: Channel1",                     XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL1},
+	[11]        = {"HBM SS3: Channel2",                     XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL2},
+	[12]        = {"HBM SS3: Channel3",                     XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL3},
+	[13]        = {"HBM SS3: Channel4",                     XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL4},
+	[14]        = {"HBM SS3: Channel5",                     XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL5},
+	[15]        = {"HBM SS3: Channel6",                     XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL6},
+	[16]        = {"HBM SS3: Channel7",                     XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL7},
+	[17]        = {"Undefined",                             XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+	[18]        = {"ANR MDFI",				XE_HW_ERR_SOC_NONFATAL_ANR_MDFI},
+	[19 ... 31] = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+};
+
+static const struct err_name_index_pair pvc_soc_slave_lcl_err_reg_nonfatal[] = {
+	[0 ... 31]  = {"Undefined",			XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+};
+
+static const struct err_name_index_pair pvc_soc_mstr_lcl_err_reg_nonfatal[] = {
+	[0 ... 3]   = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+	[4]         = {"Base Die MDFI T2T",			XE_HW_ERR_SOC_NONFATAL_MDFI_T2T},
+	[5]         = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+	[6]         = {"Base Die MDFI T2C",			XE_HW_ERR_SOC_NONFATAL_MDFI_T2C},
+	[7]         = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+	[8]         = {"Invalid CSC PSF Command Parity",	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_CMD},
+	[9]         = {"Invalid CSC PSF Unexpected Completion",	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_CMP},
+	[10]        = {"Invalid CSC PSF Unsupported Request",	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_REQ},
+	[11 ... 31] = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+};
+
 static void xe_assign_hw_err_regs(struct xe_device *xe)
 {
 	const struct err_name_index_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
@@ -294,6 +355,10 @@ static void xe_assign_hw_err_regs(struct xe_device *xe)
 		soc_mstr_lcl[HARDWARE_ERROR_FATAL] = pvc_soc_mstr_lcl_err_reg_fatal;
 		soc_slave_glbl[HARDWARE_ERROR_FATAL] = pvc_soc_slave_glbl_err_reg_fatal;
 		soc_slave_lcl[HARDWARE_ERROR_FATAL] = pvc_soc_slave_lcl_err_reg_fatal;
+		soc_mstr_glbl[HARDWARE_ERROR_NONFATAL] = pvc_soc_mstr_glbl_err_reg_nonfatal;
+		soc_mstr_lcl[HARDWARE_ERROR_NONFATAL] = pvc_soc_mstr_lcl_err_reg_nonfatal;
+		soc_slave_glbl[HARDWARE_ERROR_NONFATAL] = pvc_soc_slave_glbl_err_reg_nonfatal;
+		soc_slave_lcl[HARDWARE_ERROR_NONFATAL] = pvc_soc_slave_lcl_err_reg_nonfatal;
 	}
 
 }
@@ -556,7 +621,10 @@ xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 
 	lockdep_assert_held(&tile_to_xe(tile)->irq.lock);
 
-	if ((tile_to_xe(tile)->info.platform != XE_PVC) ||  hw_err != HARDWARE_ERROR_FATAL)
+	if (tile_to_xe(tile)->info.platform != XE_PVC)
+		return;
+
+	if (hw_err == HARDWARE_ERROR_CORRECTABLE)
 		return;
 
 	base = SOC_PVC_BASE;
diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
index ecd7edfcd38b..e43157aae938 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.h
+++ b/drivers/gpu/drm/xe/xe_hw_error.h
@@ -100,6 +100,45 @@ enum xe_tile_hw_errors {
 	XE_HW_ERR_SOC_FATAL_PCIE_PSF_CMD,
 	XE_HW_ERR_SOC_FATAL_PCIE_PSF_CMP,
 	XE_HW_ERR_SOC_FATAL_PCIE_PSF_REQ,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL0,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL1,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL2,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL3,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL4,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL5,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL6,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL7,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL0,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL1,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL2,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL3,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL4,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL5,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL6,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL7,
+	XE_HW_ERR_SOC_NONFATAL_UNKNOWN,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL0,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL1,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL2,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL3,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL4,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL5,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL6,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL7,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL0,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL1,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL2,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL3,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL4,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL5,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL6,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL7,
+	XE_HW_ERR_SOC_NONFATAL_ANR_MDFI,
+	XE_HW_ERR_SOC_NONFATAL_MDFI_T2T,
+	XE_HW_ERR_SOC_NONFATAL_MDFI_T2C,
+	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_CMD,
+	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_CMP,
+	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_REQ,
 };
 
 enum gt_vctr_registers {
-- 
2.25.1

