Re: [Intel-xe] [PATCH 05/11] drm/xe: Support GSC hardware error reporting for PVC.

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
To: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>,
	intel-xe@lists.freedesktop.org
Subject: Re: [Intel-xe] [PATCH 05/11] drm/xe: Support GSC hardware error reporting for PVC.
Date: Wed, 11 Oct 2023 12:48:07 +0530	[thread overview]
Message-ID: <f62b6408-cc93-33eb-e8e9-18a7ced769b1@linux.intel.com> (raw)
In-Reply-To: <20230927114627.136925-6-himal.prasad.ghimiray@intel.com>


On 27/09/23 17:16, Himal Prasad Ghimiray wrote:
> Add support to report GSC hw errors and counter update in case
> of correctable errors.
>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> ---
>  drivers/gpu/drm/xe/regs/xe_tile_error_regs.h |  8 ++
>  drivers/gpu/drm/xe/xe_hw_error.c             | 96 +++++++++++++++++++-
>  drivers/gpu/drm/xe/xe_hw_error.h             | 16 ++++
>  3 files changed, 117 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
> index db78d6687213..fa16eaf9436b 100644
> --- a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
> @@ -12,4 +12,12 @@
>  #define DEV_ERR_STAT_REG(x)                            XE_REG(_PICK_EVEN((x), \
>  								_DEV_ERR_STAT_CORRECTABLE, \
>  								_DEV_ERR_STAT_NONFATAL))
> +
> +#define PVC_GSC_HECI1_BASE                             0x00284000
> +#define PVC_GSC_HECI2_BASE                             0x00285000
Please keep the register definitions in this file ordered by address.
> +#define _GSC_HEC_CORR_ERR_STATUS                       0x128
> +#define _GSC_HEC_UNCOR_ERR_STATUS                     0x118
> +#define GSC_HEC_ERR_STAT_REG(base, x)                  XE_REG(_PICK_EVEN((x), \
> +								(base) + _GSC_HEC_CORR_ERR_STATUS, \
> +								(base) + _GSC_HEC_UNCOR_ERR_STATUS))
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
> index 9595e3369656..eb76b8e6a338 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.c
> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
> @@ -183,6 +183,28 @@ static const struct err_msg_cntr_pair err_stat_gt_correctable_vectr_reg[] = {
>  	[2 ... 3]         = {"L3BANK",		XE_GT_HW_ERR_L3BANK_CORR},
>  };
>  
> +static const struct err_msg_cntr_pair gsc_nonfatal_err_reg[] = {
> +	[0]         = {"MinuteIA Unexpected Shutdown",		XE_GSC_HW_ERR_MIA_SHUTDOWN_UNCOR},
> +	[1]         = {"MinuteIA Internal Error",		XE_GSC_HW_ERR_MIA_INTERNAL_UNCOR},
> +	[2]         = {"Double bit error on SRAM",		XE_GSC_HW_ERR_SRAM_UNCOR},
> +	[3]         = {"WDT 2nd Timeout",			XE_GSC_HW_ERR_WDG_UNCOR},
> +	[4]         = {"ROM has a parity error",		XE_GSC_HW_ERR_ROM_PARITY_UNCOR},
> +	[5]         = {"Ucode has a parity error",		XE_GSC_HW_ERR_UCODE_PARITY_UNCOR},
> +	[6]         = {"Errors Reported to and Detected by FW",	XE_GSC_HW_ERR_FW_UNCOR},
> +	[7]         = {"Glitch is detected on voltage rail",	XE_GSC_HW_ERR_VLT_GLITCH_UNCOR},
> +	[8]         = {"Fuse Pull Error",			XE_GSC_HW_ERR_FUSE_PULL_UNCOR},
> +	[9]         = {"Fuse CRC Check Failed on Fuse Pull",	XE_GSC_HW_ERR_FUSE_CRC_UNCOR},
> +	[10]        = {"Self Mbist Failed",			XE_GSC_HW_ERR_SELF_MBIST_UNCOR},
> +	[11]        = {"AON RF has parity error",		XE_GSC_HW_ERR_AON_RF_PARITY_UNCOR},
> +	[12 ... 31] = {"Undefined",				XE_GSC_HW_ERR_UNKNOWN_UNCOR},
> +};
> +
> +static const struct err_msg_cntr_pair gsc_correctable_err_reg[] = {
> +	[0]        = {"Single bit error on SRAM",			XE_GSC_HW_ERR_SRAM_CORR},
> +	[1]        = {"Errors Reported to FW and Detected by FW",	XE_GSC_HW_ERR_FW_CORR},
> +	[2 ... 31] = {"Undefined",					XE_GSC_HW_ERR_UNKNOWN_CORR},
> +};
> +
>  static void xe_assign_hw_err_regs(struct xe_device *xe)
>  {
>  	const struct err_msg_cntr_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
> @@ -344,6 +366,71 @@ xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>  		xe_gt_hw_error_status_reg_handler(gt, hw_err);
>  }
>  
> +static void
> +xe_gsc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
> +{
> +	const char *hw_err_str = hardware_error_type_to_str(hw_err);
> +	const struct err_msg_cntr_pair *errstat;
> +	struct xe_gt *mmio;
better name this as gt, as it really holds a struct xe_gt and not a struct mmio in tile
> +	unsigned long errsrc;
> +	const char *errmsg;
> +	u32 indx;
> +	u32 errbit;
> +	u32 base;
> +
> +	if ((tile_to_xe(tile)->info.platform != XE_PVC))
> +		return;
> +
> +	/* GSC errors are valid only on root tile and for NONFATAL and
> +	 * CORRECTABLE type.For non root tiles or FATAL type it should
> +	 * be categorized as undefined GSC HARDWARE ERROR
> +	 */
> +	base = PVC_GSC_HECI1_BASE;
> +
> +	if (tile->id || hw_err == HARDWARE_ERROR_FATAL) {
> +		drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
> +				    "Undefined GSC %s error on tile%d\n", hw_err_str, tile->id);
> +		return;
> +	}
> +
> +	lockdep_assert_held(&tile_to_xe(tile)->irq.lock);
> +	if (hw_err == HARDWARE_ERROR_CORRECTABLE)
> +		errstat = gsc_correctable_err_reg;
> +	else
> +		errstat = gsc_nonfatal_err_reg;
let's have one common design of initializing all error registers at once during driver load.
> +
> +	mmio = tile->primary_gt;
> +	errsrc = xe_mmio_read32(mmio, GSC_HEC_ERR_STAT_REG(base, hw_err));
> +	if (!errsrc) {
> +		drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
> +				    "GSC detected GSC_HEC_ERR_STAT_REG_%s blank!\n",  hw_err_str);
> +		goto clear_reg;
> +	}
> +
> +	drm_info(&tile_to_xe(tile)->drm, HW_ERR
> +		 "GSC_HEC_ERR_STAT_REG_%s=0x%08lx\n", hw_err_str, errsrc);
> +
> +	for_each_set_bit(errbit, &errsrc, 32) {
> +		errmsg = errstat[errbit].errmsg;
> +		indx = errstat[errbit].cntr_indx;
> +
> +		if (hw_err == HARDWARE_ERROR_CORRECTABLE) {
> +			drm_warn(&tile_to_xe(tile)->drm,
> +				 HW_ERR "GSC detected %s %s error, bit[%d] is set\n",
> +				 errmsg, hw_err_str, errbit);
The message should follow a common convention, like "TileN reported XXX", for all the errors we log.
> +
> +		} else {
> +			drm_err_ratelimited(&tile_to_xe(tile)->drm,
> +					    HW_ERR "GSC detected %s %s error, bit[%d] is set\n",
> +					    errmsg, hw_err_str, errbit);
> +		}
Maybe you want to skip FW_ERR here; see below.
> +		tile->errors.count[indx]++;
> +	}
> +
> +clear_reg:
> +	xe_mmio_write32(mmio, GSC_HEC_ERR_STAT_REG(base, hw_err), errsrc);
> +}
> +
>  static void
>  xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>  {
> @@ -385,10 +472,13 @@ xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_er
>  					    HW_ERR "TILE%d detected %s %s error, bit[%d] is set\n",
>  					    tile->id, errmsg, hw_err_str, errbit);
>  		tile->errors.count[indx]++;
> -	}
>  
> -	if (errsrc & REG_BIT(0))
> -		xe_gt_hw_error_handler(tile->primary_gt, hw_err);
> +		if (errbit == 0)
> +			xe_gt_hw_error_handler(tile->primary_gt, hw_err);
> +
> +		if (errbit == 8)
> +			xe_gsc_hw_error_handler(tile, hw_err);
please define what 0 and 8 are.
> +	}
>  
>  	xe_mmio_write32(mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
>  unlock:
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
> index 2812407dd4bf..155722a0af4c 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.h
> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
> @@ -48,6 +48,22 @@ enum xe_tile_hw_errors {
>  	XE_TILE_HW_ERR_GSC_CORR,
>  	XE_TILE_HW_ERR_SOC_CORR,
>  	XE_TILE_HW_ERR_UNKNOWN_CORR,
> +	XE_GSC_HW_ERR_SRAM_CORR,
> +	XE_GSC_HW_ERR_FW_CORR,
I don't think counting FW_CORR is correct, as the actual count is maintained and reported by CSC via HECI.
> +	XE_GSC_HW_ERR_UNKNOWN_CORR,
> +	XE_GSC_HW_ERR_MIA_SHUTDOWN_UNCOR,
> +	XE_GSC_HW_ERR_MIA_INTERNAL_UNCOR,
> +	XE_GSC_HW_ERR_SRAM_UNCOR,
> +	XE_GSC_HW_ERR_WDG_UNCOR,
> +	XE_GSC_HW_ERR_ROM_PARITY_UNCOR,
> +	XE_GSC_HW_ERR_UCODE_PARITY_UNCOR,
> +	XE_GSC_HW_ERR_FW_UNCOR,
> +	XE_GSC_HW_ERR_VLT_GLITCH_UNCOR,
> +	XE_GSC_HW_ERR_FUSE_PULL_UNCOR,
> +	XE_GSC_HW_ERR_FUSE_CRC_UNCOR,
> +	XE_GSC_HW_ERR_SELF_MBIST_UNCOR,
> +	XE_GSC_HW_ERR_AON_RF_PARITY_UNCOR,
We should maintain uniform naming:
everywhere else it is called NONFATAL, but here it is just UNCOR.
> +	XE_GSC_HW_ERR_UNKNOWN_UNCOR,
>  	XE_TILE_HW_ERROR_MAX,
>  };
Thanks,
Aravind,
>

next prev parent reply	other threads:[~2023-10-11  7:15 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-27 11:46 [Intel-xe] [PATCH 00/11] Supporting CSC and SOC HARDWARE ERROR HANDLING on PVC Himal Prasad Ghimiray
2023-09-27 11:43 ` [Intel-xe] ✓ CI.Patch_applied: success for " Patchwork
2023-09-27 11:43 ` [Intel-xe] ✗ CI.checkpatch: warning " Patchwork
2023-09-27 11:44 ` [Intel-xe] ✓ CI.KUnit: success " Patchwork
2023-09-27 11:46 ` [Intel-xe] [PATCH 01/11] drm/xe: Handle errors from various components Himal Prasad Ghimiray
2023-09-27 11:46 ` [Intel-xe] [PATCH 02/11] drm/xe: Log and count the GT hardware errors Himal Prasad Ghimiray
2023-09-27 11:46 ` [Intel-xe] [PATCH 03/11] drm/xe: Support GT hardware error reporting for PVC Himal Prasad Ghimiray
2023-09-27 11:46 ` [Intel-xe] [PATCH 04/11] drm/xe: Process fatal hardware errors Himal Prasad Ghimiray
2023-09-27 11:46 ` [Intel-xe] [PATCH 05/11] drm/xe: Support GSC hardware error reporting for PVC Himal Prasad Ghimiray
2023-10-11  7:18   ` Aravind Iddamsetty [this message]
2023-09-27 11:46 ` [Intel-xe] [PATCH 06/11] drm/xe: Notify userspace about GSC HW errors Himal Prasad Ghimiray
2023-10-11  7:23   ` Aravind Iddamsetty
2023-10-11  7:25     ` Ghimiray, Himal Prasad
2023-10-12  3:12       ` Aravind Iddamsetty
2023-09-27 11:46 ` [Intel-xe] [PATCH 07/11] drm/xe: Support SOC FATAL error handling for PVC Himal Prasad Ghimiray
2023-10-04  6:38   ` Aravind Iddamsetty
2023-10-04  6:50     ` Ghimiray, Himal Prasad
2023-10-08  9:32       ` Aravind Iddamsetty
2023-10-09  4:11         ` Ghimiray, Himal Prasad
2023-10-09  9:00           ` Aravind Iddamsetty
2023-10-09  9:15             ` Ghimiray, Himal Prasad
2023-10-10  6:27               ` Aravind Iddamsetty
2023-10-09  9:52   ` Aravind Iddamsetty
2023-10-09 10:14     ` Ghimiray, Himal Prasad
2023-09-27 11:46 ` [Intel-xe] [PATCH 08/11] drm/xe: Support SOC NONFATAL " Himal Prasad Ghimiray
2023-10-11  6:07   ` Aravind Iddamsetty
2023-09-27 11:46 ` [Intel-xe] [PATCH 09/11] drm/xe: Handle MDFI error severity Himal Prasad Ghimiray
2023-10-04 12:11   ` Aravind Iddamsetty
2023-09-27 11:46 ` [Intel-xe] [PATCH 10/11] drm/xe: Clear SOC CORRECTABLE error registers Himal Prasad Ghimiray
2023-10-09  9:58   ` Aravind Iddamsetty
2023-10-11  6:48   ` Aravind Iddamsetty
2023-10-11  6:52     ` Ghimiray, Himal Prasad
2023-10-12  2:59       ` Aravind Iddamsetty
2023-10-12  4:01         ` Ghimiray, Himal Prasad
2023-09-27 11:46 ` [Intel-xe] [PATCH 11/11] drm/xe: Clear all SoC errors post warm reset Himal Prasad Ghimiray
2023-10-11  6:56   ` Aravind Iddamsetty
2023-10-11  6:59     ` Ghimiray, Himal Prasad
2023-10-12  3:05       ` Aravind Iddamsetty
2023-09-27 11:51 ` [Intel-xe] ✓ CI.Build: success for Supporting CSC and SOC HARDWARE ERROR HANDLING on PVC Patchwork
2023-09-27 11:52 ` [Intel-xe] ✗ CI.Hooks: failure " Patchwork
2023-09-27 11:53 ` [Intel-xe] ✓ CI.checksparse: success " Patchwork
2023-09-27 12:28 ` [Intel-xe] ✗ CI.BAT: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f62b6408-cc93-33eb-e8e9-18a7ced769b1@linux.intel.com \
    --to=aravind.iddamsetty@linux.intel.com \
    --cc=himal.prasad.ghimiray@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.