From: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
To: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>,
intel-xe@lists.freedesktop.org
Subject: Re: [Intel-xe] [PATCH v2 04/10] drm/xe: Support GSC hardware error reporting for PVC.
Date: Thu, 19 Oct 2023 13:55:31 +0530 [thread overview]
Message-ID: <508aa2bb-004a-aec8-188e-e72e08f914f8@linux.intel.com> (raw)
In-Reply-To: <20231018040033.1227494-5-himal.prasad.ghimiray@intel.com>
On 18/10/23 09:30, Himal Prasad Ghimiray wrote:
> Add support to report GSC hw errors and counter update in case
> of correctable errors.
>
> v2
> - skip FW_ERR reporting via counters.
> - maintain uniform naming for enums.
> - Use same convention for error reporting.(Aravind)
>
> Cc: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> ---
> drivers/gpu/drm/xe/regs/xe_tile_error_regs.h | 9 ++
> drivers/gpu/drm/xe/xe_device_types.h | 1 +
> drivers/gpu/drm/xe/xe_hw_error.c | 96 ++++++++++++++++++++
> drivers/gpu/drm/xe/xe_hw_error.h | 19 +++-
> 4 files changed, 124 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
> index 2224f7d328e5..1d18f560f200 100644
> --- a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
> @@ -7,10 +7,19 @@
>
> #include <linux/stddef.h>
>
> +#define _GSC_HEC_UNCOR_ERR_STATUS 0x118
> +#define _GSC_HEC_CORR_ERR_STATUS 0x128
> +#define GSC_HEC_ERR_STAT_REG(base, x) XE_REG(_PICK_EVEN((x), \
> + (base) + _GSC_HEC_CORR_ERR_STATUS, \
> + (base) + _GSC_HEC_UNCOR_ERR_STATUS))
> +
> #define _DEV_ERR_STAT_NONFATAL 0x100178
> #define _DEV_ERR_STAT_CORRECTABLE 0x10017c
> #define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \
> _DEV_ERR_STAT_CORRECTABLE, \
> _DEV_ERR_STAT_NONFATAL))
> #define XE_GT_ERROR 0
> +#define XE_GSC_ERROR 8
> +
> +#define PVC_GSC_HECI1_BASE 0x284000
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index b86182dd89f3..2998ee517f0d 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -416,6 +416,7 @@ struct xe_device {
> const struct err_name_index_pair *dev_err_stat[HARDWARE_ERROR_MAX];
> const struct err_name_index_pair *err_stat_gt[HARDWARE_ERROR_MAX];
> const struct err_name_index_pair *err_vctr_gt[HARDWARE_ERROR_MAX];
> + const struct err_name_index_pair *gsc_error[HARDWARE_ERROR_MAX];
> } hw_err_regs;
>
> /* private: */
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
> index e5141371c4dc..1e94ee72a34f 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.c
> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
> @@ -165,11 +165,34 @@ static const struct err_name_index_pair pvc_err_vectr_gt_correctable_reg[] = {
> [2 ... 3] = {"L3BANK", XE_HW_ERR_GT_CORR_L3BANK},
> };
>
> +static const struct err_name_index_pair pvc_gsc_nonfatal_err_reg[] = {
> + [0] = {"MinuteIA Unexpected Shutdown", XE_HW_ERR_GSC_NONFATAL_MIA_SHUTDOWN},
> + [1] = {"MinuteIA Internal Error", XE_HW_ERR_GSC_NONFATAL_MIA_INTERNAL},
> + [2] = {"Double bit error on SRAM", XE_HW_ERR_GSC_NONFATAL_SRAM},
> + [3] = {"WDT 2nd Timeout", XE_HW_ERR_GSC_NONFATAL_WDG},
> + [4] = {"ROM has a parity error", XE_HW_ERR_GSC_NONFATAL_ROM_PARITY},
> + [5] = {"Ucode has a parity error", XE_HW_ERR_GSC_NONFATAL_UCODE_PARITY},
> + [6] = {"Errors Reported to and Detected by FW", XE_HW_ERR_TILE_UNSPEC},
> + [7] = {"Glitch is detected on voltage rail", XE_HW_ERR_GSC_NONFATAL_VLT_GLITCH},
> + [8] = {"Fuse Pull Error", XE_HW_ERR_GSC_NONFATAL_FUSE_PULL},
> + [9] = {"Fuse CRC Check Failed on Fuse Pull", XE_HW_ERR_GSC_NONFATAL_FUSE_CRC},
> + [10] = {"Self Mbist Failed", XE_HW_ERR_GSC_NONFATAL_SELF_MBIST},
> + [11] = {"AON RF has parity error", XE_HW_ERR_GSC_NONFATAL_AON_RF_PARITY},
> + [12 ... 31] = {"Undefined", XE_HW_ERR_GSC_NONFATAL_UNKNOWN},
> +};
> +
> +static const struct err_name_index_pair pvc_gsc_correctable_err_reg[] = {
> + [0] = {"Single bit error on SRAM", XE_HW_ERR_GSC_CORR_SRAM},
> + [1] = {"Errors Reported to FW and Detected by FW", XE_HW_ERR_TILE_UNSPEC},
> + [2 ... 31] = {"Undefined", XE_HW_ERR_GSC_CORR_UNKNOWN},
> +};
> +
> void xe_assign_hw_err_regs(struct xe_device *xe)
> {
> const struct err_name_index_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
> const struct err_name_index_pair **err_stat_gt = xe->hw_err_regs.err_stat_gt;
> const struct err_name_index_pair **err_vctr_gt = xe->hw_err_regs.err_vctr_gt;
> + const struct err_name_index_pair **gsc_error = xe->hw_err_regs.gsc_error;
>
> /* Error reporting is supported only for DG2 and
> * PVC currently. Error reporting support for other
> @@ -191,6 +214,8 @@ void xe_assign_hw_err_regs(struct xe_device *xe)
> err_stat_gt[HARDWARE_ERROR_FATAL] = pvc_err_stat_gt_fatal_reg;
> err_vctr_gt[HARDWARE_ERROR_CORRECTABLE] = pvc_err_vectr_gt_correctable_reg;
> err_vctr_gt[HARDWARE_ERROR_FATAL] = pvc_err_vectr_gt_fatal_reg;
> + gsc_error[HARDWARE_ERROR_CORRECTABLE] = pvc_gsc_correctable_err_reg;
> + gsc_error[HARDWARE_ERROR_NONFATAL] = pvc_gsc_nonfatal_err_reg;
> }
>
> }
> @@ -362,6 +387,72 @@ xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
> xe_gt_hw_error_log_vector_reg(gt, hw_err);
> }
>
> +static void
> +xe_gsc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
> +{
> + const char *hw_err_str = hardware_error_type_to_str(hw_err);
> + const struct err_name_index_pair *errstat;
> + struct hardware_errors_regs *err_regs;
> + struct xe_gt *gt;
> + unsigned long errsrc;
> + const char *name;
> + u32 indx;
> + u32 errbit;
> + u32 base;
> +
> + if ((tile_to_xe(tile)->info.platform != XE_PVC))
> + return;
> +
> + /* GSC errors are valid only on root tile and for NONFATAL and
This multi-line comment shall begin with a blank line, i.e. a bare "/*" on its own line.
> + * CORRECTABLE type.For non root tiles or FATAL type it should
> + * be categorized as undefined GSC HARDWARE ERROR
> + */
> + base = PVC_GSC_HECI1_BASE;
> +
> + if (tile->id || hw_err == HARDWARE_ERROR_FATAL) {
> + drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
> + "Tile%d reported GSC %s Undefined error.\n",
> + tile->id, hw_err_str);
> + return;
> + }
> +
> + lockdep_assert_held(&tile_to_xe(tile)->irq.lock);
> + err_regs = &tile_to_xe(tile)->hw_err_regs;
> + errstat = err_regs->gsc_error[hw_err];
> + gt = tile->primary_gt;
> + errsrc = xe_mmio_read32(gt, GSC_HEC_ERR_STAT_REG(base, hw_err));
> + if (!errsrc) {
> + drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
> + "Tile0 reported GSC_HEC_ERR_STAT_REG_%s blank!\n", hw_err_str);
> + goto clear_reg;
> + }
> +
> + drm_dbg(&tile_to_xe(tile)->drm, HW_ERR
> + "Tile0 reported GSC_HEC_ERR_STAT_REG_%s=0x%08lx\n", hw_err_str, errsrc);
> +
> + for_each_set_bit(errbit, &errsrc, XE_RAS_REG_SIZE) {
> + name = errstat[errbit].name;
> + indx = errstat[errbit].index;
> +
> + if (hw_err == HARDWARE_ERROR_CORRECTABLE) {
> + drm_warn(&tile_to_xe(tile)->drm,
> + HW_ERR "Tile0 reported GSC %s %s error, bit[%d] is set\n",
> + name, hw_err_str, errbit);
> +
> + } else {
> + drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
> + "Tile0 reported GSC %s %s error, bit[%d] is set\n",
> + name, hw_err_str, errbit);
> + }
> + if (indx != XE_HW_ERR_TILE_UNSPEC)
> + xe_update_hw_error_cnt(&tile_to_xe(tile)->drm,
> + &tile->errors.hw_error, indx);
> + }
> +
> +clear_reg:
> + xe_mmio_write32(gt, GSC_HEC_ERR_STAT_REG(base, hw_err), errsrc);
> +}
> +
> static void
> xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
> {
> @@ -416,9 +507,14 @@ xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_er
> if (indx != XE_HW_ERR_TILE_UNSPEC)
> xe_update_hw_error_cnt(&tile_to_xe(tile)->drm,
> &tile->errors.hw_error, indx);
> +
> if (errbit == XE_GT_ERROR)
> xe_gt_hw_error_handler(tile->primary_gt, hw_err);
> +
> + if (errbit == XE_GSC_ERROR)
> + xe_gsc_hw_error_handler(tile, hw_err);
> }
> +
> xe_mmio_write32(gt, DEV_ERR_STAT_REG(hw_err), errsrc);
> unlock:
> spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
> index ce924d2d6038..1dad66a85799 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.h
> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
> @@ -38,7 +38,24 @@ enum xe_tile_hw_errors {
> XE_HW_ERR_TILE_CORR_SGUNIT,
> XE_HW_ERR_TILE_CORR_UNKNOWN,
> XE_HW_ERR_TILE_UNSPEC,
> - XE_HW_ERROR_TILE_MAX,
> +};
> +
> +enum xe_gsc_hw_errors {
> + XE_HW_ERR_GSC_CORR_SRAM = XE_HW_ERR_TILE_UNSPEC + 1,
> + XE_HW_ERR_GSC_CORR_UNKNOWN,
> + XE_HW_ERR_GSC_NONFATAL_MIA_SHUTDOWN,
> + XE_HW_ERR_GSC_NONFATAL_MIA_INTERNAL,
> + XE_HW_ERR_GSC_NONFATAL_SRAM,
> + XE_HW_ERR_GSC_NONFATAL_WDG,
> + XE_HW_ERR_GSC_NONFATAL_ROM_PARITY,
> + XE_HW_ERR_GSC_NONFATAL_UCODE_PARITY,
> + XE_HW_ERR_GSC_NONFATAL_VLT_GLITCH,
> + XE_HW_ERR_GSC_NONFATAL_FUSE_PULL,
> + XE_HW_ERR_GSC_NONFATAL_FUSE_CRC,
> + XE_HW_ERR_GSC_NONFATAL_SELF_MBIST,
> + XE_HW_ERR_GSC_NONFATAL_AON_RF_PARITY,
> + XE_HW_ERR_GSC_NONFATAL_UNKNOWN,
> + XE_HW_ERROR_TILE_MAX
> };
I'm sorry, I suggested having separate enums for GSC and SOC, but looking at this I feel
what you had earlier was better. The only concern I had was that they might not be applicable
on all platforms, but even having separate enums might not solve this.
>
> enum gt_vctr_registers {
With the above addressed, Reviewed-by: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
Thanks,
Aravind.
next prev parent reply other threads:[~2023-10-19 8:22 UTC|newest]
Thread overview: 36+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-10-18 4:00 [Intel-xe] [PATCH v9 00/10] Supporting RAS on XE Himal Prasad Ghimiray
2023-10-18 3:57 ` [Intel-xe] ✓ CI.Patch_applied: success for " Patchwork
2023-10-18 3:57 ` [Intel-xe] ✗ CI.checkpatch: warning " Patchwork
2023-10-18 3:59 ` [Intel-xe] ✓ CI.KUnit: success " Patchwork
2023-10-18 4:00 ` [Intel-xe] [PATCH v8 01/10] drm/xe: Handle errors from various components Himal Prasad Ghimiray
2023-10-19 8:23 ` Aravind Iddamsetty
2023-10-19 13:23 ` Upadhyay, Tejas
2023-10-18 4:00 ` [Intel-xe] [PATCH v7 02/10] drm/xe: Log and count the GT hardware errors Himal Prasad Ghimiray
2023-10-19 8:24 ` Aravind Iddamsetty
2023-10-18 4:00 ` [Intel-xe] [PATCH v6 03/10] drm/xe: Support GT hardware error reporting for PVC Himal Prasad Ghimiray
2023-10-19 8:25 ` Aravind Iddamsetty
2023-10-18 4:00 ` [Intel-xe] [PATCH v2 04/10] drm/xe: Support GSC " Himal Prasad Ghimiray
2023-10-19 8:25 ` Aravind Iddamsetty [this message]
2023-10-18 4:00 ` [Intel-xe] [PATCH v2 05/10] drm/xe: Notify userspace about GSC HW errors Himal Prasad Ghimiray
2023-10-19 0:52 ` Welty, Brian
2023-10-19 5:36 ` Ghimiray, Himal Prasad
2023-10-19 6:02 ` Aravind Iddamsetty
2023-10-19 6:36 ` Ghimiray, Himal Prasad
2023-10-18 4:00 ` [Intel-xe] [PATCH v3 06/10] drm/xe: Support SOC FATAL error handling for PVC Himal Prasad Ghimiray
2023-10-19 8:25 ` Aravind Iddamsetty
2023-10-18 4:00 ` [Intel-xe] [PATCH v2 07/10] drm/xe: Support SOC NONFATAL " Himal Prasad Ghimiray
2023-10-19 8:26 ` Aravind Iddamsetty
2023-10-18 4:00 ` [Intel-xe] [PATCH v2 08/10] drm/xe: Handle MDFI error severity Himal Prasad Ghimiray
2023-10-19 8:26 ` Aravind Iddamsetty
2023-10-18 4:00 ` [Intel-xe] [PATCH v2 09/10] drm/xe: Clear SOC CORRECTABLE error registers Himal Prasad Ghimiray
2023-10-19 8:26 ` Aravind Iddamsetty
2023-10-18 4:00 ` [Intel-xe] [PATCH v4 10/10] drm/xe: Clear all SoC errors post warm reset Himal Prasad Ghimiray
2023-10-19 8:26 ` Aravind Iddamsetty
2023-10-18 4:07 ` [Intel-xe] ✓ CI.Build: success for Supporting RAS on XE Patchwork
2023-10-18 4:08 ` [Intel-xe] ✓ CI.Hooks: " Patchwork
2023-10-18 4:09 ` [Intel-xe] ✓ CI.checksparse: " Patchwork
2023-10-18 4:45 ` [Intel-xe] ✓ CI.BAT: " Patchwork
-- strict thread matches above, loose matches on Subject: below --
2023-10-18 2:57 [Intel-xe] [PATCH v8 00/10] " Himal Prasad Ghimiray
2023-10-18 2:57 ` [Intel-xe] [PATCH v2 04/10] drm/xe: Support GSC hardware error reporting for PVC Himal Prasad Ghimiray
2023-10-18 2:48 [Intel-xe] [PATCH v8 00/10] *Supporting RAS on XE Himal Prasad Ghimiray
2023-10-18 2:48 ` [Intel-xe] [PATCH v2 04/10] drm/xe: Support GSC hardware error reporting for PVC Himal Prasad Ghimiray
2023-10-17 5:09 [Intel-xe] [PATCH v6 00/10] Supporting RAS on XE Himal Prasad Ghimiray
2023-10-17 5:09 ` [Intel-xe] [PATCH v2 04/10] drm/xe: Support GSC hardware error reporting for PVC Himal Prasad Ghimiray
2023-10-17 4:15 [Intel-xe] [PATCH v6 00/10] Supporting RAS on XE Himal Prasad Ghimiray
2023-10-17 4:15 ` [Intel-xe] [PATCH v2 04/10] drm/xe: Support GSC hardware error reporting for PVC Himal Prasad Ghimiray
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=508aa2bb-004a-aec8-188e-e72e08f914f8@linux.intel.com \
--to=aravind.iddamsetty@linux.intel.com \
--cc=himal.prasad.ghimiray@intel.com \
--cc=intel-xe@lists.freedesktop.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.