Re: [PATCH v3 6/7] drm/xe/xe_hw_error: Handle CSC Firmware reported Hardware errors

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: Riana Tauro <riana.tauro@intel.com>
To: "Summers, Stuart" <stuart.summers@intel.com>,
	"intel-xe@lists.freedesktop.org" <intel-xe@lists.freedesktop.org>
Cc: "Jadav, Raag" <raag.jadav@intel.com>,
	"Anirban, Sk" <sk.anirban@intel.com>,
	"Vivi, Rodrigo" <rodrigo.vivi@intel.com>,
	"Scarbrough, Frank" <frank.scarbrough@intel.com>,
	"aravind.iddamsetty@linux.intel.com"
	<aravind.iddamsetty@linux.intel.com>,
	"Gupta, Anshuman" <anshuman.gupta@intel.com>,
	"Nerlige Ramappa, Umesh" <umesh.nerlige.ramappa@intel.com>,
	"De Marchi, Lucas" <lucas.demarchi@intel.com>
Subject: Re: [PATCH v3 6/7] drm/xe/xe_hw_error: Handle CSC Firmware reported Hardware errors
Date: Thu, 10 Jul 2025 11:08:09 +0530	[thread overview]
Message-ID: <301ed83e-8224-4557-8421-4dfa0fea0fac@intel.com> (raw)
In-Reply-To: <f4efe2419d53874cc7a1baa0c60f4c24cd0eeb68.camel@intel.com>

Hi Stuart

On 7/9/2025 11:27 PM, Summers, Stuart wrote:
> On Wed, 2025-07-02 at 19:41 +0530, Riana Tauro wrote:
>> Add support to handle CSC firmware reported errors. When CSC firmware
>> errors are encoutered, a error interrupt is received by the GFX
>> device as
>> a MSI interrupt.
>>
>> Device Source control registers indicates the source of the error as
>> CSC
>> The HEC error status register indicates that the error is firmware
>> reported
>> Depending on the type of error, the error cause is written to the HEC
>> Firmware error register.
>>
>> On encountering such CSC firmware errors, the graphics device is
>> non-recoverable from driver context. The only way to recover from
>> these
>> errors is firmware flash. The device is then wedged and userspace is
>> notified with a drm uevent
>>
>> v2: use vendor recovery method with
>>      runtime survivability (Christian, Rodrigo, Raag)
>>
>> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>> ---
>>   drivers/gpu/drm/xe/regs/xe_gsc_regs.h      |  2 +
>>   drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  7 ++-
>>   drivers/gpu/drm/xe/xe_device.c             | 11 +++-
>>   drivers/gpu/drm/xe/xe_device_types.h       |  3 +
>>   drivers/gpu/drm/xe/xe_hw_error.c           | 70
>> +++++++++++++++++++++-
>>   5 files changed, 88 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/regs/xe_gsc_regs.h
>> b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h
>> index 9b66cc972a63..180be82672ab 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_gsc_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h
>> @@ -13,6 +13,8 @@
>>   
>>   /* Definitions of GSC H/W registers, bits, etc */
>>   
>> +#define BMG_GSC_HECI1_BASE     0x373000
>> +
>>   #define MTL_GSC_HECI1_BASE     0x00116000
>>   #define MTL_GSC_HECI2_BASE     0x00117000
>>   
>> diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> index ed9b81fb28a0..c146b9ef44eb 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> @@ -6,10 +6,15 @@
>>   #ifndef _XE_HW_ERROR_REGS_H_
>>   #define _XE_HW_ERROR_REGS_H_
>>   
>> +#define HEC_UNCORR_ERR_STATUS(base)                    XE_REG((base)
>> + 0x118)
>> +#define    UNCORR_FW_REPORTED_ERR                      BIT(6)
>> +
>> +#define HEC_UNCORR_FW_ERR_DW0(base)                    XE_REG((base)
>> + 0x124)
>> +
>>   #define DEV_ERR_STAT_NONFATAL                  0x100178
>>   #define DEV_ERR_STAT_CORRECTABLE               0x10017c
>>   #define
>> DEV_ERR_STAT_REG(x)                    XE_REG(_PICK_EVEN((x), \
>>                                                                   
>> DEV_ERR_STAT_CORRECTABLE, \
>>                                                                   
>> DEV_ERR_STAT_NONFATAL))
>> -
>> +#define   XE_CSC_ERROR                         BIT(17)
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_device.c
>> b/drivers/gpu/drm/xe/xe_device.c
>> index d6b680abc3ae..fbc50cebfc11 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -1154,6 +1154,7 @@ static void xe_device_wedged_fini(struct
>> drm_device *drm, void *arg)
>>    */
>>   void xe_device_declare_wedged(struct xe_device *xe)
>>   {
>> +       unsigned long recovery_method;
>>          struct xe_gt *gt;
>>          u8 id;
>>   
>> @@ -1169,6 +1170,12 @@ void xe_device_declare_wedged(struct xe_device
>> *xe)
>>                  return;
>>          }
>>   
>> +       /* Default recovery method */
>> +       recovery_method = DRM_WEDGE_RECOVERY_REBIND |
>> DRM_WEDGE_RECOVERY_BUS_RESET;
>> +
>> +       if (xe_survivability_mode_is_runtime(xe))
>> +               recovery_method = DRM_WEDGE_RECOVERY_VENDOR;
>> +
>>          for_each_gt(gt, xe, id)
>>                  xe_gt_declare_wedged(gt);
>>   
>> @@ -1181,8 +1188,6 @@ void xe_device_declare_wedged(struct xe_device
>> *xe)
>>                          dev_name(xe->drm.dev));
>>   
>>                  /* Notify userspace of wedged device */
>> -               drm_dev_wedged_event(&xe->drm,
>> -                                    DRM_WEDGE_RECOVERY_REBIND |
>> DRM_WEDGE_RECOVERY_BUS_RESET,
>> -                                    NULL);
>> +               drm_dev_wedged_event(&xe->drm, recovery_method,
>> NULL);
>>          }
>>   }
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h
>> b/drivers/gpu/drm/xe/xe_device_types.h
>> index 7e4f6d846af6..5daf5ba6bf51 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -241,6 +241,9 @@ struct xe_tile {
>>          /** @memirq: Memory Based Interrupts. */
>>          struct xe_memirq memirq;
>>   
>> +       /** @csc_hw_error_work: worker to report CSC HW errors */
>> +       struct work_struct csc_hw_error_work;
>> +
>>          /** @pcode: tile's PCODE */
>>          struct {
>>                  /** @pcode.lock: protecting tile's PCODE mailbox data
>> */
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c
>> b/drivers/gpu/drm/xe/xe_hw_error.c
>> index 0f2590839900..73c788fd0dee 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.c
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -3,12 +3,16 @@
>>    * Copyright © 2025 Intel Corporation
>>    */
>>   
>> +#include "regs/xe_gsc_regs.h"
>>   #include "regs/xe_hw_error_regs.h"
>>   #include "regs/xe_irq_regs.h"
>>   
>>   #include "xe_device.h"
>>   #include "xe_hw_error.h"
>>   #include "xe_mmio.h"
>> +#include "xe_survivability_mode.h"
>> +
>> +#define  HEC_UNCORR_FW_ERR_BITS 4
>>   
>>   /* Error categories reported by hardware */
>>   enum hardware_error {
>> @@ -18,6 +22,13 @@ enum hardware_error {
>>          HARDWARE_ERROR_MAX,
>>   };
>>   
>> +static const char * const hec_uncorrected_fw_errors[] = {
>> +       "Fatal",
>> +       "CSE Disabled",
>> +       "FD Corruption",
>> +       "Data Corruption"
>> +};
>> +
>>   static const char *hw_error_to_str(const enum hardware_error hw_err)
>>   {
>>          switch (hw_err) {
>> @@ -32,6 +43,58 @@ static const char *hw_error_to_str(const enum
>> hardware_error hw_err)
>>          }
>>   }
>>   
>> +static void csc_hw_error_work(struct work_struct *work)
>> +{
>> +       struct xe_tile *tile = container_of(work, typeof(*tile),
>> csc_hw_error_work);
>> +       struct xe_device *xe = tile_to_xe(tile);
>> +       int ret;
>> +
>> +       ret = xe_survivability_mode_enable(xe,
>> XE_SURVIVABILITY_TYPE_RUNTIME);
>> +       if (ret)
>> +               drm_err(&xe->drm, "Failed to enable runtime
>> survivability mode\n");
>> +
>> +       xe_device_declare_wedged(xe);
>> +}
>> +
>> +static void csc_hw_error_handler(struct xe_tile *tile, const enum
>> hardware_error hw_err)
>> +{
>> +       const char *hw_err_str = hw_error_to_str(hw_err);
>> +       struct xe_device *xe = tile_to_xe(tile);
>> +       struct xe_mmio *mmio = &tile->mmio;
>> +       u32 base, err_bit, err_src;
>> +       unsigned long fw_err;
>> +
>> +       if (xe->info.platform != XE_BATTLEMAGE)
>> +               return;
>> +
>> +       /* Not supported in BMG */
>> +       if (hw_err == HARDWARE_ERROR_CORRECTABLE)
>> +               return;
> 
> Again, here and above, why are we specifically limiting this to BMG?

This is CSC error handler and this bit is present only from BMG and the 
heci base here is also specific to bmg. Hence the check
CSC in BMG doesn't support correctable errors.

Thanks
Riana

> 
> Thanks,
> Stuart
> 
>> +
>> +       base = BMG_GSC_HECI1_BASE;
>> +       lockdep_assert_held(&xe->irq.lock);
>> +       err_src = xe_mmio_read32(mmio, HEC_UNCORR_ERR_STATUS(base));
>> +       if (!err_src) {
>> +               drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported
>> HEC_ERR_STATUS_%s blank\n",
>> +                                   tile->id, hw_err_str);
>> +               return;
>> +       }
>> +
>> +       if (err_src & UNCORR_FW_REPORTED_ERR) {
>> +               fw_err = xe_mmio_read32(mmio,
>> HEC_UNCORR_FW_ERR_DW0(base));
>> +               for_each_set_bit(err_bit, &fw_err,
>> HEC_UNCORR_FW_ERR_BITS) {
>> +                       drm_err_ratelimited(&xe->drm, HW_ERR
>> +                                           "%s: HEC Uncorrected FW
>> %s error reported, bit[%d] is set\n",
>> +                                            hw_err_str,
>> hec_uncorrected_fw_errors[err_bit],
>> +                                            err_bit);
>> +
>> +                       schedule_work(&tile->csc_hw_error_work);
>> +               }
>> +       }
>> +
>> +       xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src);
>> +}
>> +
>>   static void hw_error_source_handler(struct xe_tile *tile, const enum
>> hardware_error hw_err)
>>   {
>>          const char *hw_err_str = hw_error_to_str(hw_err);
>> @@ -50,7 +113,8 @@ static void hw_error_source_handler(struct xe_tile
>> *tile, const enum hardware_er
>>                  goto unlock;
>>          }
>>   
>> -       /* TODO: Process errrors per source */
> 
> I still think we should have a print here to show the errors we
> received, especially since CSC isn't the only bit here. We're just only
> implementing recovery support for that case.
> 
> Thanks,
> Stuart
> 
>> +       if (err_src & XE_CSC_ERROR)
>> +               csc_hw_error_handler(tile, hw_err);
>>   
>>          xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err),
>> err_src);
>>   
>> @@ -101,8 +165,12 @@ static void process_hw_errors(struct xe_device
>> *xe)
>>    */
>>   void xe_hw_error_init(struct xe_device *xe)
>>   {
>> +       struct xe_tile *tile = xe_device_get_root_tile(xe);
>> +
>>          if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
>>                  return;
>>   
>> +       INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
>> +
>>          process_hw_errors(xe);
>>   }
>

next prev parent reply	other threads:[~2025-07-10  5:38 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-07-02 14:11 [PATCH v3 0/7] Handle Firmware reported Hardware Errors Riana Tauro
2025-07-02 14:11 ` [PATCH v3 1/7] drm: Add a vendor-specific recovery method to device wedged uevent Riana Tauro
2025-07-03  4:06   ` Raag Jadav
2025-07-03  5:20     ` Riana Tauro
2025-07-03  6:40       ` Raag Jadav
2025-07-03  6:50         ` Riana Tauro
2025-07-02 14:11 ` [PATCH v3 2/7] drm/xe: Set GT as wedged before sending " Riana Tauro
2025-07-02 21:41   ` Rodrigo Vivi
2025-07-03  4:18   ` Raag Jadav
2025-07-03  5:18     ` Riana Tauro
2025-07-03  6:45       ` Raag Jadav
2025-07-07  6:44         ` Riana Tauro
2025-07-02 14:11 ` [PATCH v3 3/7] drm/xe/xe_survivability: Add support for Runtime survivability mode Riana Tauro
2025-07-02 21:40   ` Rodrigo Vivi
2025-07-03  5:16     ` Riana Tauro
2025-07-02 23:33   ` kernel test robot
2025-07-09 18:04   ` Summers, Stuart
2025-07-10  5:27     ` Riana Tauro
2025-07-15 17:30       ` Summers, Stuart
2025-07-02 14:11 ` [PATCH v3 4/7] drm/xe/doc: Document device wedged and runtime survivability Riana Tauro
2025-07-02 13:55   ` Riana Tauro
2025-07-03  7:19   ` Raag Jadav
2025-07-02 14:11 ` [PATCH v3 5/7] drm/xe: Add support to handle hardware errors Riana Tauro
2025-07-09 17:27   ` Summers, Stuart
2025-07-10  5:54     ` Riana Tauro
2025-07-02 14:11 ` [PATCH v3 6/7] drm/xe/xe_hw_error: Handle CSC Firmware reported Hardware errors Riana Tauro
2025-07-02 21:35   ` Rodrigo Vivi
2025-07-03  5:28     ` Riana Tauro
2025-07-09 17:57   ` Summers, Stuart
2025-07-10  5:38     ` Riana Tauro [this message]
2025-07-02 14:11 ` [PATCH v3 7/7] drm/xe/xe_hw_error: Add fault injection to trigger csc error handler Riana Tauro
2025-07-02 15:53 ` ✗ CI.checkpatch: warning for Handle Firmware reported Hardware Errors (rev3) Patchwork
2025-07-02 15:54 ` ✓ CI.KUnit: success " Patchwork
2025-07-02 16:17 ` ✗ CI.checksparse: warning " Patchwork
2025-07-02 16:39 ` ✓ Xe.CI.BAT: success " Patchwork
2025-07-04  6:45 ` ✗ Xe.CI.Full: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=301ed83e-8224-4557-8421-4dfa0fea0fac@intel.com \
    --to=riana.tauro@intel.com \
    --cc=anshuman.gupta@intel.com \
    --cc=aravind.iddamsetty@linux.intel.com \
    --cc=frank.scarbrough@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=lucas.demarchi@intel.com \
    --cc=raag.jadav@intel.com \
    --cc=rodrigo.vivi@intel.com \
    --cc=sk.anirban@intel.com \
    --cc=stuart.summers@intel.com \
    --cc=umesh.nerlige.ramappa@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox