Re: [Intel-xe] [PATCH v5 3/4] drm/xe: Support GT hardware error reporting for PVC.

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Ghimiray, Himal Prasad" <himal.prasad.ghimiray@intel.com>
To: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>,
	<intel-xe@lists.freedesktop.org>
Cc: Matt Roper <matthew.d.roper@intel.com>,
	Rodrigo Vivi <rodrigo.vivi@intel.com>
Subject: Re: [Intel-xe] [PATCH v5 3/4] drm/xe: Support GT hardware error reporting for PVC.
Date: Tue, 26 Sep 2023 10:41:55 +0530	[thread overview]
Message-ID: <237a5223-ff08-4d9d-8ddd-b2f9cf71e350@intel.com> (raw)
In-Reply-To: <63c64b22-501c-d6e1-669e-b85822d88819@linux.intel.com>


On 26-09-2023 09:51, Aravind Iddamsetty wrote:
> On 23/08/23 14:28, Himal Prasad Ghimiray wrote:
>> PVC supports GT error reporting via vector registers alongwith
>> error status register. Add support to report these errors and
>> update respective counters.
>> Incase of Subslice error reported by vector register, process the
>> error status register for applicable bits.
>>
>> Bspec: 54179, 54177, 53088, 53089
>>
>> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
>> Cc: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
>> Cc: Matthew Brost <matthew.brost@intel.com>
>> Cc: Matt Roper <matthew.d.roper@intel.com>
>> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
>> ---
>>   drivers/gpu/drm/xe/regs/xe_gt_error_regs.h |  16 +++
>>   drivers/gpu/drm/xe/xe_hw_error.c           | 122 ++++++++++++++++++++-
>>   drivers/gpu/drm/xe/xe_hw_error.h           |  20 ++++
>>   3 files changed, 154 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
>> index 6180704a6149..39ea87914465 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
>> @@ -10,4 +10,20 @@
>>   #define ERR_STAT_GT_REG(x)              XE_REG(_PICK_EVEN((x), \
>>   						_ERR_STAT_GT_COR, \
>>   						_ERR_STAT_GT_NONFATAL))
>> +
>> +#define _ERR_STAT_GT_COR_VCTR_0         0x1002a0
>> +#define _ERR_STAT_GT_COR_VCTR_1         0x1002a4
>> +#define ERR_STAT_GT_COR_VCTR_REG(x)     XE_REG(_PICK_EVEN((x), \
>> +						_ERR_STAT_GT_COR_VCTR_0, \
>> +						_ERR_STAT_GT_COR_VCTR_1))
>> +
>> +#define _ERR_STAT_GT_FATAL_VCTR_0       0x100260
>> +#define _ERR_STAT_GT_FATAL_VCTR_1       0x100264
> the registers shall be defined in the ascending order of their addresses.
sure.
>> +#define ERR_STAT_GT_FATAL_VCTR_REG(x)   XE_REG(_PICK_EVEN((x), \
>> +						_ERR_STAT_GT_FATAL_VCTR_0, \
>> +						_ERR_STAT_GT_FATAL_VCTR_1))
>> +
>> +#define ERR_STAT_GT_VCTR_REG(hw_err, x) (hw_err == HARDWARE_ERROR_CORRECTABLE ? \
>> +						ERR_STAT_GT_COR_VCTR_REG(x) : \
>> +						ERR_STAT_GT_FATAL_VCTR_REG(x))
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
>> index 10aad0c396fb..deb020a509d2 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.c
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -148,6 +148,41 @@ static const struct err_msg_cntr_pair err_stat_gt_correctable_reg[] = {
>>   	[16 ... 31] = {"Undefined",             XE_GT_HW_ERR_UNKNOWN_CORR},
>>   };
>>   
>> +static const struct err_msg_cntr_pair pvc_err_stat_gt_fatal_reg[] = {
>> +	[0 ... 2]   =  {"Undefined",		XE_GT_HW_ERR_UNKNOWN_FATAL},
>> +	[3]         =  {"FPU",			XE_GT_HW_ERR_FPU_FATAL},
>> +	[4 ... 5]   =  {"Undefined",            XE_GT_HW_ERR_UNKNOWN_FATAL},
>> +	[6]         =  {"GUC SRAM",		XE_GT_HW_ERR_GUC_FATAL},
>> +	[7 ... 12]  =  {"Undefined",		XE_GT_HW_ERR_UNKNOWN_FATAL},
>> +	[13]        =  {"SLM",			XE_GT_HW_ERR_SLM_FATAL},
>> +	[14]        =  {"Undefined",		XE_GT_HW_ERR_UNKNOWN_FATAL},
>> +	[15]        =  {"EU GRF",		XE_GT_HW_ERR_EU_GRF_FATAL},
>> +	[16 ... 31] =  {"Undefined",            XE_GT_HW_ERR_UNKNOWN_FATAL},
>> +};
> let all fatal definitions be moved into the patch in which fatal is processed
Squashing patch 4 with pacth 1.
>> +
>> +static const struct err_msg_cntr_pair pvc_err_stat_gt_correctable_reg[] = {
>> +	[0]         = {"Undefined",             XE_GT_HW_ERR_UNKNOWN_CORR},
>> +	[1]         = {"SINGLE BIT GUC SRAM",	XE_GT_HW_ERR_GUC_CORR},
>> +	[2 ... 12]  = {"Undefined",		XE_GT_HW_ERR_UNKNOWN_CORR},
>> +	[13]        = {"SINGLE BIT SLM",	XE_GT_HW_ERR_SLM_CORR},
>> +	[14]        = {"SINGLE BIT EU IC",	XE_GT_HW_ERR_EU_IC_CORR},
>> +	[15]        = {"SINGLE BIT EU GRF",	XE_GT_HW_ERR_EU_GRF_CORR},
>> +	[16 ... 31] = {"Undefined",             XE_GT_HW_ERR_UNKNOWN_CORR},
>> +};
>> +
>> +static const struct err_msg_cntr_pair err_stat_gt_fatal_vectr_reg[] = {
>> +	[0 ... 1]         = {"SUBSLICE",	XE_GT_HW_ERR_SUBSLICE_FATAL},
>> +	[2 ... 3]         = {"L3BANK",		XE_GT_HW_ERR_L3BANK_FATAL},
>> +	[4 ... 5]         = {"Undefined",	XE_GT_HW_ERR_UNKNOWN_FATAL},
>> +	[6]               = {"TLB",		XE_GT_HW_ERR_TLB_FATAL},
>> +	[7]               = {"L3 FABRIC",	XE_GT_HW_ERR_L3_FABRIC_FATAL},
>> +};
>> +
>> +static const struct err_msg_cntr_pair err_stat_gt_correctable_vectr_reg[] = {
>> +	[0 ... 1]         = {"SUBSLICE",	XE_GT_HW_ERR_SUBSLICE_CORR},
>> +	[2 ... 3]         = {"L3BANK",		XE_GT_HW_ERR_L3BANK_CORR},
>> +};
>> +
>>   void xe_assign_hw_err_regs(struct xe_device *xe)
>>   {
>>   	const struct err_msg_cntr_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
>> @@ -164,6 +199,8 @@ void xe_assign_hw_err_regs(struct xe_device *xe)
>>   		dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = pvc_err_stat_correctable_reg;
>>   		dev_err_stat[HARDWARE_ERROR_NONFATAL] = pvc_err_stat_nonfatal_reg;
>>   		dev_err_stat[HARDWARE_ERROR_FATAL] = pvc_err_stat_fatal_reg;
>> +		err_stat_gt[HARDWARE_ERROR_CORRECTABLE] = pvc_err_stat_gt_correctable_reg;
>> +		err_stat_gt[HARDWARE_ERROR_FATAL] = pvc_err_stat_gt_fatal_reg;
>>   	} else {
>>   		/* For other platforms report only GT errors */
>>   		dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = dev_err_stat_correctable_reg;
>> @@ -176,7 +213,7 @@ void xe_assign_hw_err_regs(struct xe_device *xe)
>>   }
>>   
>>   static void
>> -xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>> +xe_gt_hw_error_status_reg_handler(struct xe_gt *gt, const enum hardware_error hw_err)
> is xe_gt_hw_error_log_status_reg sounding better ? and have this in the earlier patch
Ok
>>   {
>>   	const char *hw_err_str = hardware_error_type_to_str(hw_err);
>>   	const struct err_msg_cntr_pair *errstat;
>> @@ -186,9 +223,6 @@ xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>>   	u32 indx;
>>   	u32 errbit;
>>   
>> -	if (gt_to_xe(gt)->info.platform == XE_PVC)
>> -		return;
>> -
>>   	lockdep_assert_held(&gt_to_xe(gt)->irq.lock);
>>   	err_regs = &gt_to_xe(gt)->hw_err_regs;
>>   	errsrc = xe_mmio_read32(gt, ERR_STAT_GT_REG(hw_err));
>> @@ -230,6 +264,86 @@ xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>>   clear_reg: xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err), errsrc);
>>   }
>>   
>> +static void
>> +xe_gt_hw_error_vectr_reg_handler(struct xe_gt *gt, const enum hardware_error hw_err)
> similarly, xe_gt_hw_error_log_vector_reg
ok
>> +{
>> +	const char *hw_err_str = hardware_error_type_to_str(hw_err);
>> +	const struct err_msg_cntr_pair *errvctr;
>> +	const char *errmsg;
>> +	bool errstat_read;
>> +	u32 num_vctr_reg;
>> +	u32 indx;
>> +	u32 vctr;
>> +	u32 i;
>> +
>> +	switch (hw_err) {
>> +	case HARDWARE_ERROR_FATAL:
>> +		num_vctr_reg = ERR_STAT_GT_FATAL_VCTR_LEN;
>> +		errvctr = err_stat_gt_fatal_vectr_reg;
> why don't we define registers once like it's done  in  xe_assign_hw_err_regs.
>> +		break;
>> +	case HARDWARE_ERROR_NONFATAL:
>> +		/*  The GT Non Fatal Error Status Register has only reserved bits
>> +		 *  Nothing to service.
>> +		 */
>> +		drm_err_ratelimited(&gt_to_xe(gt)->drm, HW_ERR "GT%d detected %s error\n",
>> +				    gt->info.id, hw_err_str);
>> +		return;
>> +	case HARDWARE_ERROR_CORRECTABLE:
>> +		num_vctr_reg = ERR_STAT_GT_COR_VCTR_LEN;
>> +		errvctr = err_stat_gt_correctable_vectr_reg;
>> +		break;
>> +	default:
>> +		return;
>> +	}
>> +
>> +	errstat_read = false;
>> +
>> +	for (i = 0 ; i < num_vctr_reg; i++) {
>> +		vctr = xe_mmio_read32(gt, ERR_STAT_GT_VCTR_REG(hw_err, i));
>> +		if (!vctr)
>> +			continue;
>> +
>> +		errmsg = errvctr[i].errmsg;
>> +		indx = errvctr[i].cntr_indx;
>> +
>> +		if (hw_err == HARDWARE_ERROR_FATAL)
>> +			drm_err_ratelimited(&gt_to_xe(gt)->drm, HW_ERR
>> +					    "GT%d detected %s %s error. ERR_VECT_GT_%s[%d]:0x%08x\n",
>> +					    gt->info.id, errmsg, hw_err_str, hw_err_str, i, vctr);
>> +		else
>> +			drm_warn(&gt_to_xe(gt)->drm, HW_ERR
>> +				 "GT%d detected %s %s error. ERR_VECT_GT_%s[%d]:0x%08x\n",
>> +				 gt->info.id, errmsg, hw_err_str, hw_err_str, i, vctr);
>> +
>> +		if (i < ERR_STAT_GT_VCTR4)
>> +			gt->errors.count[indx] += hweight32(vctr);
>> +
>> +		if (i == ERR_STAT_GT_VCTR6)
>> +			gt->errors.count[indx] += hweight16(vctr);
>> +
>> +		if (i == ERR_STAT_GT_VCTR7)
>> +			gt->errors.count[indx] += hweight8(vctr);
>> +
>> +		if (i < ERR_STAT_GT_VCTR2 && !errstat_read) {
>> +			xe_gt_hw_error_status_reg_handler(gt, hw_err);
>> +			errstat_read = true;
> what is the need of errstat_read, isn't i < ERR_STAT_GT_VCTR2 sufficient
In case we see errors from both vect0 and vect1. We shouldn't be 
servicing error status register twice.
>
> and also instead of if check for every i we can have switch case
>
> switch (i) {
>
> case ERR_STAT_GT_VCTR0:
> case ERR_STAT_GT_VCTR1:
> case ERR_STAT_GT_VCTR2:
> case ERR_STAT_GT_VCTR3:
>
>     gt->errors.count[indx] += hweight32(vctr);
>
>      if ( i < ERR_STAT_GT_VCTR2)
>
>          xe_gt_hw_error_status_reg_handler(gt, hw_err);
>
>      break;
>
> case ERR_STAT_GT_VCTR6:
> case ERR_STAT_GT_VCTR7:
>
>      gt->errors.count[indx] += (i == ERR_STAT_GT_VCTR6) ? hweight16(vctr) : hweight8(vctr);
>      break;
>
> default:
Looks clean. Will use this.
>
> }
>
>> +		}
>> +
>> +		xe_mmio_write32(gt, ERR_STAT_GT_VCTR_REG(hw_err, i), vctr);
>> +	}
>> +}
>> +
>> +static void
>> +xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>> +{
>> +	lockdep_assert_held(&gt_to_xe(gt)->irq.lock);
>> +
>> +	if (gt_to_xe(gt)->info.platform == XE_PVC)
>> +		xe_gt_hw_error_vectr_reg_handler(gt, hw_err);
>> +	else
>> +		xe_gt_hw_error_status_reg_handler(gt, hw_err);
>> +}
>> +
>>   static void
>>   xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>>   {
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
>> index 82c947247c27..3fcbbcc338fe 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.h
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>> @@ -51,8 +51,21 @@ enum xe_tile_hw_errors {
>>   	XE_TILE_HW_ERROR_MAX,
>>   };
>>   
>> +enum gt_vctr_registers {
>> +	ERR_STAT_GT_VCTR0 = 0,
>> +	ERR_STAT_GT_VCTR1,
>> +	ERR_STAT_GT_VCTR2,
>> +	ERR_STAT_GT_VCTR3,
>> +	ERR_STAT_GT_VCTR4,
>> +	ERR_STAT_GT_VCTR5,
>> +	ERR_STAT_GT_VCTR6,
>> +	ERR_STAT_GT_VCTR7,
>> +};
>> +
>>   /* Count of GT Correctable and FATAL HW ERRORS */
>>   enum xe_gt_hw_errors {
>> +	XE_GT_HW_ERR_SUBSLICE_CORR,
>> +	XE_GT_HW_ERR_L3BANK_CORR,
>>   	XE_GT_HW_ERR_L3_SNG_CORR,
>>   	XE_GT_HW_ERR_GUC_CORR,
>>   	XE_GT_HW_ERR_SAMPLER_CORR,
>> @@ -60,6 +73,8 @@ enum xe_gt_hw_errors {
>>   	XE_GT_HW_ERR_EU_IC_CORR,
>>   	XE_GT_HW_ERR_EU_GRF_CORR,
>>   	XE_GT_HW_ERR_UNKNOWN_CORR,
>> +	XE_GT_HW_ERR_SUBSLICE_FATAL,
>> +	XE_GT_HW_ERR_L3BANK_FATAL,
>>   	XE_GT_HW_ERR_ARR_BIST_FATAL,
>>   	XE_GT_HW_ERR_FPU_FATAL,
>>   	XE_GT_HW_ERR_L3_DOUB_FATAL,
>> @@ -71,10 +86,15 @@ enum xe_gt_hw_errors {
>>   	XE_GT_HW_ERR_SLM_FATAL,
>>   	XE_GT_HW_ERR_EU_IC_FATAL,
>>   	XE_GT_HW_ERR_EU_GRF_FATAL,
>> +	XE_GT_HW_ERR_TLB_FATAL,
>> +	XE_GT_HW_ERR_L3_FABRIC_FATAL,
>>   	XE_GT_HW_ERR_UNKNOWN_FATAL,
>>   	XE_GT_HW_ERROR_MAX,
>>   };
>>   
>> +#define ERR_STAT_GT_COR_VCTR_LEN (4)
>> +#define ERR_STAT_GT_FATAL_VCTR_LEN (8)
>> +
>>   struct err_msg_cntr_pair {
>>   	const char *errmsg;
>>   	const u32 cntr_indx;
> Thanks,
> Aravind.

next prev parent reply	other threads:[~2023-09-26  5:12 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-08-23  8:58 [Intel-xe] [PATCH v5 0/4] Supporting RAS on XE Himal Prasad Ghimiray
2023-08-23  8:58 ` [Intel-xe] [PATCH v5 1/4] drm/xe: Handle errors from various components Himal Prasad Ghimiray
2023-09-26  4:20   ` Aravind Iddamsetty
2023-09-26  4:57     ` Ghimiray, Himal Prasad
2023-09-26 10:09       ` Aravind Iddamsetty
2023-10-04 12:07   ` Aravind Iddamsetty
2023-10-05  4:01     ` Aravind Iddamsetty
2023-08-23  8:58 ` [Intel-xe] [PATCH v5 2/4] drm/xe: Log and count the GT hardware errors Himal Prasad Ghimiray
2023-09-26  4:20   ` Aravind Iddamsetty
2023-09-26  5:08     ` Ghimiray, Himal Prasad
2023-08-23  8:58 ` [Intel-xe] [PATCH v5 3/4] drm/xe: Support GT hardware error reporting for PVC Himal Prasad Ghimiray
2023-09-26  4:21   ` Aravind Iddamsetty
2023-09-26  5:11     ` Ghimiray, Himal Prasad [this message]
2023-08-23  8:58 ` [Intel-xe] [PATCH v5 4/4] drm/xe: Process fatal hardware errors Himal Prasad Ghimiray
2023-09-26  4:21   ` Aravind Iddamsetty
2023-09-26 10:24     ` Ghimiray, Himal Prasad
2023-08-23  9:00 ` [Intel-xe] ✓ CI.Patch_applied: success for Supporting RAS on XE (rev4) Patchwork
2023-08-23  9:00 ` [Intel-xe] ✗ CI.checkpatch: warning " Patchwork
2023-08-23  9:01 ` [Intel-xe] ✓ CI.KUnit: success " Patchwork
2023-08-23  9:05 ` [Intel-xe] ✓ CI.Build: " Patchwork
2023-08-23  9:05 ` [Intel-xe] ✗ CI.Hooks: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=237a5223-ff08-4d9d-8ddd-b2f9cf71e350@intel.com \
    --to=himal.prasad.ghimiray@intel.com \
    --cc=aravind.iddamsetty@linux.intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.d.roper@intel.com \
    --cc=rodrigo.vivi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.