Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Riana Tauro <riana.tauro@intel.com>
To: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: <intel-xe@lists.freedesktop.org>, <anshuman.gupta@intel.com>,
	<lucas.demarchi@intel.com>, <badal.nilawar@intel.com>
Subject: Re: [PATCH 2/2] drm/xe/xe_survivability: Add support for survivability mode v2
Date: Thu, 13 Nov 2025 13:56:19 +0530	[thread overview]
Message-ID: <291ca7a1-51fb-4f28-b1b7-1e4f9d12878c@intel.com> (raw)
In-Reply-To: <aRTQCoNGUf_D0f1F@intel.com>

Hi Rodrigo

On 11/12/2025 11:50 PM, Rodrigo Vivi wrote:
> On Wed, Nov 12, 2025 at 04:03:39PM +0530, Riana Tauro wrote:
>> v2 survivability breadcrumbs introduces a new mode called
>> SPI Flash Descriptor Override mode (FDO). This is enabled by
>> PCODE when MEI itself fails and firmware cannot be updated via
>> MEI using igsc. This mode provides the ability to update
>> the firmware directly via SPI driver.
>>
>> Xe KMD initializes the nvm aux driver if FDO mode is enabled.
>>
>> Userspace should check FDO mode entry in survivability info sysfs before
>> using the SPI driver to update firmware.
>>
>> 	/sys/bus/pci/devices/<device>/survivability_info/fdo_mode
>>
>> v2 also supports survivability mode for critical boot errors.
>>
>> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_pcode_api.h             |  2 +
>>   drivers/gpu/drm/xe/xe_survivability_mode.c    | 41 +++++++++++++++++--
>>   .../gpu/drm/xe/xe_survivability_mode_types.h  |  6 +++
>>   3 files changed, 46 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_pcode_api.h b/drivers/gpu/drm/xe/xe_pcode_api.h
>> index 92bfcba51e19..d41f07f9194d 100644
>> --- a/drivers/gpu/drm/xe/xe_pcode_api.h
>> +++ b/drivers/gpu/drm/xe/xe_pcode_api.h
>> @@ -77,11 +77,13 @@
>>   
>>   #define PCODE_SCRATCH(x)		XE_REG(0x138320 + ((x) * 4))
>>   /* PCODE_SCRATCH0 */
>> +#define   BREADCRUMB_VERSION		REG_GENMASK(31, 29)
>>   #define   AUXINFO_REG_OFFSET		REG_GENMASK(17, 15)
>>   #define   OVERFLOW_REG_OFFSET		REG_GENMASK(14, 12)
>>   #define   HISTORY_TRACKING		REG_BIT(11)
>>   #define   OVERFLOW_SUPPORT		REG_BIT(10)
>>   #define   AUXINFO_SUPPORT		REG_BIT(9)
>> +#define   FDO_MODE			REG_BIT(4)
>>   #define   BOOT_STATUS			REG_GENMASK(3, 1)
>>   #define      CRITICAL_FAILURE		4
>>   #define      NON_CRITICAL_FAILURE	7
>> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
>> index 3d9417911c33..d22fdd08d227 100644
>> --- a/drivers/gpu/drm/xe/xe_survivability_mode.c
>> +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
>> @@ -16,6 +16,7 @@
>>   #include "xe_heci_gsc.h"
>>   #include "xe_i2c.h"
>>   #include "xe_mmio.h"
>> +#include "xe_nvm.h"
>>   #include "xe_pcode_api.h"
>>   #include "xe_vsec.h"
>>   
>> @@ -66,6 +67,11 @@
>>    *
>>    * - ``aux_info<n>`` : Some failures have additional debug information
>>    *
>> + * - ``fdo_mode`` : To allow recovery in scenarios where MEI itself fails, a new SPI Flash
>> + *   Descriptor Override (FDO) mode is added in v2 survivability breadcrumbs. This mode is enabled
>> + *   by PCODE and provides the ability to directly update the firmware via SPI Driver without
>> + *   any dependency on MEI. Xe KMD initializes the nvm aux driver if FDO mode is enabled.
>> + *
>>    * Runtime Survivability
>>    * =====================
>>    *
>> @@ -95,6 +101,8 @@ enum scratch_reg {
>>   	MAX_SCRATCH_REG,
>>   };
>>   
>> +#define FDO_INFO	(MAX_SCRATCH_REG + 1)
>> +
>>   struct xe_survivability_attribute {
>>   	struct device_attribute attr;
>>   	u8 index;
>> @@ -131,6 +139,11 @@ static void populate_survivability_info(struct xe_device *xe)
>>   	set_survivability_info(mmio, info, CAPABILITY_INFO, "Capability Info");
>>   	reg_value = info[CAPABILITY_INFO].value;
>>   
>> +	survivability->version = REG_FIELD_GET(BREADCRUMB_VERSION, reg_value);
>> +	/* FDO mode is exposed only from version 2 */
>> +	if (survivability->version >= 2)
>> +		survivability->fdo_mode = REG_FIELD_GET(FDO_MODE, reg_value);
>> +
>>   	if (reg_value & HISTORY_TRACKING) {
>>   		set_survivability_info(mmio, info, POSTCODE_TRACE, "Postcode Trace");
>>   
>> @@ -193,6 +206,9 @@ static ssize_t survivability_info_show(struct device *dev,
>>   	struct xe_survivability_info *info = survivability->info;
>>   	struct xe_survivability_attribute *sa = dev_attr_to_survivability_attr(attr);
>>   
>> +	if (sa->index == FDO_INFO)
>> +		return sysfs_emit(buff, "%s\n", str_enabled_disabled(survivability->fdo_mode));
>> +
>>   	return sysfs_emit(buff, "0x%x\n", info[sa->index].value);
>>   }
>>   
>> @@ -210,13 +226,18 @@ SURVIVABILITY_ATTR_RO(aux_info1, AUX_INFO1);
>>   SURVIVABILITY_ATTR_RO(aux_info2, AUX_INFO2);
>>   SURVIVABILITY_ATTR_RO(aux_info3, AUX_INFO3);
>>   SURVIVABILITY_ATTR_RO(aux_info4, AUX_INFO4);
>> +SURVIVABILITY_ATTR_RO(fdo_mode, FDO_INFO);
>>   
>>   static void xe_survivability_mode_fini(void *arg)
>>   {
>>   	struct xe_device *xe = arg;
>> +	struct xe_survivability *survivability = &xe->survivability;
>>   	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>>   	struct device *dev = &pdev->dev;
>>   
>> +	if (survivability->fdo_mode)
>> +		xe_nvm_fini(xe);
>> +
>>   	device_remove_file(dev, &dev_attr_survivability_mode);
>>   }
>>   
>> @@ -227,12 +248,16 @@ static umode_t survivability_info_attrs_visible(struct kobject *kobj, struct att
>>   	struct xe_survivability *survivability = &xe->survivability;
>>   	struct xe_survivability_info *info = survivability->info;
>>   
>> -	if (info[idx].value)
>> +	/* FDO mode is visible only when supported */
>> +	if (idx >= MAX_SCRATCH_REG && survivability->version >= 2)
> 
> should we also make the version a file inside survivability_info?
> for explicit check instead of implicit?

Are you suggesting that we add the version to sysfs and let the user
check both the version and the fdo_mode entry before deciding?

> 
>> +		return 0400;
>> +	else if (info[idx].value)
>>   		return 0400;
>>   
>>   	return 0;
>>   }
>>   
>> +/* Attributes are ordered according to enum scratch_reg */
>>   static struct attribute *survivability_info_attrs[] = {
>>   	&attr_capability_info.attr.attr,
>>   	&attr_postcode_trace.attr.attr,
>> @@ -242,6 +267,7 @@ static struct attribute *survivability_info_attrs[] = {
>>   	&attr_aux_info2.attr.attr,
>>   	&attr_aux_info3.attr.attr,
>>   	&attr_aux_info4.attr.attr,
>> +	&attr_fdo_mode.attr.attr,
>>   	NULL,
>>   };
>>   
>> @@ -301,11 +327,18 @@ static int enable_boot_survivability_mode(struct pci_dev *pdev)
>>   	if (ret)
>>   		goto err;
>>   
>> +	if (survivability->fdo_mode) {
>> +		ret = xe_nvm_init(xe);
>> +		if (ret)
>> +			goto err;
> 
> should we really fail the survivability mode here?
> Or keep the survivability mode with some indication that fdo/nvm has failed?

FDO mode is only enabled when MEI path to update firmware is not 
available due to some error in MEI.

If initializing SPI also fails, then the user cannot update firmware in
survivability mode. Staying in survivability mode will not be helpful
since both paths are unavailable. That's why I added the failure.

I should remove the failure on mei_init instead; I missed that. Thank you
for the review comment. I will fix it in the next revision.

Thanks
Riana
> 
>> +	}
>> +
>>   	dev_err(dev, "In Survivability Mode\n");
>>   
>>   	return 0;
>>   
>>   err:
>> +	dev_err(dev, "Failed to enable Survivability Mode\n");
>>   	survivability->mode = false;
>>   	return ret;
>>   }
>> @@ -436,8 +469,10 @@ int xe_survivability_mode_boot_enable(struct xe_device *xe)
>>   	if (ret)
>>   		return ret;
>>   
>> -	/* Log breadcrumbs but do not enter survivability mode for Critical boot errors */
>> -	if (survivability->boot_status == CRITICAL_FAILURE) {
>> +	/*
>> +	 * v2 supports survivability mode for critical errors
>> +	 */
>> +	if (survivability->version < 2  && survivability->boot_status == CRITICAL_FAILURE) {
>>   		log_survivability_info(pdev);
>>   		return -ENXIO;
>>   	}
>> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode_types.h b/drivers/gpu/drm/xe/xe_survivability_mode_types.h
>> index 1ed122cf62f2..d887b443b397 100644
>> --- a/drivers/gpu/drm/xe/xe_survivability_mode_types.h
>> +++ b/drivers/gpu/drm/xe/xe_survivability_mode_types.h
>> @@ -37,6 +37,12 @@ struct xe_survivability {
>>   
>>   	/** @type: survivability type */
>>   	enum xe_survivability_type type;
>> +
>> +	/** @fdo_mode: indicates if FDO mode is enabled */
>> +	bool fdo_mode;
>> +
>> +	/** @version: breadcrumb version of survivability mode  */
>> +	u8 version;
>>   };
>>   
>>   #endif /* _XE_SURVIVABILITY_MODE_TYPES_H_ */
>> -- 
>> 2.47.1
>>


  reply	other threads:[~2025-11-13  8:26 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-12 10:33 [PATCH 0/2] Redesign survivability mode Riana Tauro
2025-11-12 10:12 ` ✓ CI.KUnit: success for " Patchwork
2025-11-12 10:33 ` [PATCH 1/2] drm/xe/xe_survivability: Redesign survivability mode sysfs entries Riana Tauro
2025-11-12 18:22   ` Rodrigo Vivi
2025-11-13  8:30     ` Riana Tauro
2025-11-12 10:33 ` [PATCH 2/2] drm/xe/xe_survivability: Add support for survivability mode v2 Riana Tauro
2025-11-12 18:20   ` Rodrigo Vivi
2025-11-13  8:26     ` Riana Tauro [this message]
2025-11-13 22:45       ` Rodrigo Vivi
2025-11-20  5:21         ` Riana Tauro
2025-11-20 14:25           ` Rodrigo Vivi
2025-11-12 10:49 ` ✓ Xe.CI.BAT: success for Redesign survivability mode Patchwork
2025-11-12 12:31 ` ✗ Xe.CI.Full: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=291ca7a1-51fb-4f28-b1b7-1e4f9d12878c@intel.com \
    --to=riana.tauro@intel.com \
    --cc=anshuman.gupta@intel.com \
    --cc=badal.nilawar@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=lucas.demarchi@intel.com \
    --cc=rodrigo.vivi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox