All of lore.kernel.org
 help / color / mirror / Atom feed
From: Riana Tauro <riana.tauro@intel.com>
To: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: <intel-xe@lists.freedesktop.org>, <anshuman.gupta@intel.com>,
	<aravind.iddamsetty@intel.com>, <jani.nikula@linux.intel.com>,
	<matthew.d.roper@intel.com>, <alexander.usyskin@intel.com>
Subject: Re: [PATCH v2 1/3] drm/xe: Add functions and sysfs for boot survivability
Date: Wed, 15 Jan 2025 21:47:53 +0530	[thread overview]
Message-ID: <bfb3420c-dc1b-4947-bfa7-2d60d6bce2db@intel.com> (raw)
In-Reply-To: <Z4E7DB6pbb11QDWT@intel.com>

Hi Rodrigo

On 1/10/2025 8:51 PM, Rodrigo Vivi wrote:
> On Wed, Jan 08, 2025 at 04:09:57PM +0530, Riana Tauro wrote:
>> Boot Survivability is a software based workflow for recovering a system
>> in a failed boot state. Here system recoverability is concerned with
>> recovering the firmware responsible for boot.
>>
>> This is implemented by loading the driver with bare minimum (no drm card)
>> to allow the firmware to be flashed through mei-gsc and collect telemetry.
>> The driver's probe flow is modified such that it enters survivability mode
>> when pcode initialization is incomplete and boot status denotes a failure.
>> In this mode, drm card is not exposed and presence of survivability_mode
>> entry in PCI sysfs  is used to indicate survivability mode and
>> provide additional information required for debug
>>
>> This patch adds initialization functions and exposes admin
>> readable sysfs entries
>>
>> The new sysfs will have the below layout
>>
>> 	/sys/bus/.../bdf
>>               	     ├── survivability_mode
>>
>> v2: reorder headers
>>      fix doc
>>      remove survivability info and use mode to display information
>>      use separate function for logging survivability information
>>      for critical error (Rodrigo)
>>
>> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>> ---
>>   drivers/gpu/drm/xe/Makefile                   |   1 +
>>   drivers/gpu/drm/xe/xe_device_types.h          |   4 +
>>   drivers/gpu/drm/xe/xe_pcode_api.h             |  14 ++
>>   drivers/gpu/drm/xe/xe_survivability_mode.c    | 231 ++++++++++++++++++
>>   drivers/gpu/drm/xe/xe_survivability_mode.h    |  17 ++
>>   .../gpu/drm/xe/xe_survivability_mode_types.h  |  35 +++
>>   6 files changed, 302 insertions(+)
>>   create mode 100644 drivers/gpu/drm/xe/xe_survivability_mode.c
>>   create mode 100644 drivers/gpu/drm/xe/xe_survivability_mode.h
>>   create mode 100644 drivers/gpu/drm/xe/xe_survivability_mode_types.h
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index 5c97ad6ed738..fb1cb98ce891 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -95,6 +95,7 @@ xe-y += xe_bb.o \
>>   	xe_sa.o \
>>   	xe_sched_job.o \
>>   	xe_step.o \
>> +	xe_survivability_mode.o \
>>   	xe_sync.o \
>>   	xe_tile.o \
>>   	xe_tile_sysfs.o \
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>> index 8a7b15972413..0f5a052150c9 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -21,6 +21,7 @@
>>   #include "xe_pt_types.h"
>>   #include "xe_sriov_types.h"
>>   #include "xe_step_types.h"
>> +#include "xe_survivability_mode_types.h"
>>   
>>   #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>>   #define TEST_VM_OPS_ERROR
>> @@ -341,6 +342,9 @@ struct xe_device {
>>   		u8 skip_pcode:1;
>>   	} info;
>>   
>> +	/** @survivability: survivability information for device */
>> +	struct xe_survivability survivability;
>> +
>>   	/** @irq: device interrupt state */
>>   	struct {
>>   		/** @irq.lock: lock for processing irq's on this device */
>> diff --git a/drivers/gpu/drm/xe/xe_pcode_api.h b/drivers/gpu/drm/xe/xe_pcode_api.h
>> index f153ce96f69a..4e373b8199ca 100644
>> --- a/drivers/gpu/drm/xe/xe_pcode_api.h
>> +++ b/drivers/gpu/drm/xe/xe_pcode_api.h
>> @@ -49,6 +49,20 @@
>>   /* Domain IDs (param2) */
>>   #define     PCODE_MBOX_DOMAIN_HBM		0x2
>>   
>> +#define PCODE_SCRATCH_ADDR(x)		XE_REG(0x138320 + ((x) * 4))
>> +/* PCODE_SCRATCH0 */
>> +#define   AUXINFO_REG_OFFSET		REG_GENMASK(17, 15)
>> +#define   OVERFLOW_REG_OFFSET		REG_GENMASK(14, 12)
>> +#define   HISTORY_TRACKING		REG_BIT(11)
>> +#define   OVERFLOW_SUPPORT		REG_BIT(10)
>> +#define   AUXINFO_SUPPORT		REG_BIT(9)
>> +#define   BOOT_STATUS			REG_GENMASK(3, 1)
>> +#define      CRITICAL_FAILURE		4
>> +#define      NON_CRITICAL_FAILURE	7
>> +
>> +/* Auxillary info bits */
>> +#define   AUXINFO_HISTORY_OFFSET	REG_GENMASK(31, 29)
>> +
>>   struct pcode_err_decode {
>>   	int errno;
>>   	const char *str;
>> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
>> new file mode 100644
>> index 000000000000..077422ae009d
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
>> @@ -0,0 +1,231 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#include "xe_survivability_mode.h"
>> +#include "xe_survivability_mode_types.h"
>> +
>> +#include <drm/drm_managed.h>
>> +#include <linux/kobject.h>
>> +#include <linux/pci.h>
>> +#include <linux/sysfs.h>
>> +
>> +#include "xe_device.h"
>> +#include "xe_gt.h"
>> +#include "xe_mmio.h"
>> +#include "xe_pcode_api.h"
>> +
>> +#define MAX_SCRATCH_MMIO 8
>> +
>> +/**
>> + * DOC: Xe Boot Survivability
>> + *
>> + * Boot Survivability is a software based workflow for recovering a system in a failed boot state
>> + * Here system recoverability is concerned with recovering the firmware responsible for boot.
>> + *
>> + * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
>> + * to be flashed through mei and collect telemetry. The driver's probe flow is modified
>> + * such that it enters survivability mode when pcode initialization is incomplete and boot status
>> + * denotes a failure. The driver then  populates the survivability_mode PCI sysfs indicating
>> + * survivability mode and provides additional information required for debug
>> + *
>> + * KMD exposes below admin-only readable sysfs in survivability mode
>> + *
>> + * device/survivability_mode: The presence of this file indicates that the card is in survivability
>> + *			      mode. Also, provides additional information on why the driver entered
>> + *			      survivability mode.
>> + *
>> + *			      Capability Information - Provides boot status
>> + *			      Postcode Information   - Provides information about the failure
>> + *			      Overflow Information   - Provides history of previous failures
>> + *			      Auxillary Information  - Certain failures may have information in
>> + *						       addition to postcode information
>> + */
>> +
>> +static void set_survivability_info(struct xe_device *xe, struct xe_survivability_info *info,
>> +				   int id, char *name)
>> +{
>> +	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
>> +
>> +	strscpy(info[id].name, name, sizeof(info[id].name));
>> +	info[id].reg = PCODE_SCRATCH_ADDR(id).raw;
>> +	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH_ADDR(id));
>> +}
>> +
>> +static int populate_survivability_info(struct xe_device *xe)
>> +{
>> +	struct xe_survivability *survivability = &xe->survivability;
>> +	struct xe_survivability_info *info = survivability->info;
>> +	u32 capability_info;
>> +	int id = 0;
>> +
>> +	set_survivability_info(xe, info, id, "Capability Info");
>> +	capability_info = info[id].value;
>> +
>> +	if (capability_info & HISTORY_TRACKING) {
>> +		id++;
>> +		set_survivability_info(xe, info, id, "Postcode Info");
>> +
>> +		if (capability_info & OVERFLOW_SUPPORT) {
>> +			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, capability_info);
>> +			/* ID should be within MAX_SCRATCH_MMIO */
>> +			if (id >= MAX_SCRATCH_MMIO)
>> +				return -EINVAL;
>> +			set_survivability_info(xe, info, id, "Overflow Info");
>> +		}
>> +	}
>> +
>> +	if (capability_info & AUXINFO_SUPPORT) {
>> +		u32 aux_info;
>> +		int index = 0;
>> +		char name[NAME_MAX];
>> +
>> +		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, capability_info);
>> +		if (id >= MAX_SCRATCH_MMIO)
>> +			return -EINVAL;
>> +
>> +		snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
>> +		set_survivability_info(xe, info, id, name);
>> +		aux_info = info[id].value;
>> +
>> +		while ((id = REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, aux_info)) &&
>> +		       (id < MAX_SCRATCH_MMIO)) {
> 
> This is a clear case where 'for' is better. But also, generally here we
> try to limit while usages...
This is similar to linked list with the address of prev aux registers in 
the AUXINFO_HISTORY_OFFSET. So used while.

Using for would be like below

for (id = REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, aux_info);
      aux_info && id < MAX_SCRATCH_MMIO; id 
=REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, aux_info))

Isn't while better?
> 
>> +			index++;
>> +			snprintf(name, NAME_MAX, "Prev Auxiliary Info %d", index);
>> +			set_survivability_info(xe, info, id, name);
>> +			aux_info = info[id].value;
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void log_survivability_info(struct xe_device *xe)
>> +{
>> +	struct xe_survivability *survivability = &xe->survivability;
>> +	struct xe_survivability_info *info = survivability->info;
>> +	int id;
>> +
>> +	drm_info(&xe->drm, "Survivability Boot Status : Critical Failure (%d)\n",
>> +		 survivability->boot_status);
> 
> hmm, since we are avoiding the drm, should we really use drm variants here?
> or the pci/dev ones?!

drm variants use the dev ones and prints the prefix if drm is not null.
Will change the drm_info in this file  but the logs in mei and vsec 
initialization would have to be retained.
> 
>> +	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
>> +		if (info[id].reg)
>> +			drm_info(&xe->drm, "%s: 0x%x - 0x%x\n", info[id].name,
>> +				 info[id].reg, info[id].value);
>> +	}
>> +}
>> +
>> +static ssize_t survivability_mode_show(struct device *dev,
>> +				       struct device_attribute *attr, char *buff)
>> +{
>> +	struct pci_dev *pdev = to_pci_dev(dev);
>> +	struct xe_device *xe = pdev_to_xe_device(pdev);
>> +	struct xe_survivability *survivability = &xe->survivability;
>> +	struct xe_survivability_info *info = survivability->info;
>> +	int index = 0, count = 0;
>> +
>> +	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
>> +		if (info[index].reg)
>> +			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
>> +					       info[index].reg, info[index].value);
>> +	}
>> +
>> +	return count;
>> +}
>> +
>> +static DEVICE_ATTR_ADMIN_RO(survivability_mode);
>> +
>> +static void enable_survivability_mode(struct xe_device *xe)
>> +{
>> +	struct xe_survivability *survivability = &xe->survivability;
>> +	struct device *dev = xe->drm.dev;
> 
> do we really have this pointer valid at this point?!
This is allocated in xe_device_create. Registration is done later in 
xe_device_probe so the prints and xe->drm.dev will be valid

Thanks
Riana
> 
>> +	int ret = 0;
>> +
>> +	/* set survivability mode */
>> +	survivability->mode = true;
>> +	drm_info(&xe->drm, "In Survivability Mode\n");
> 
> same here...
> 
>> +
>> +	/* create survivability mode sysfs */
>> +	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
>> +	if (ret) {
>> +		drm_warn(&xe->drm, "Failed to create survivability sysfs files\n");
>> +		return;
>> +	}
>> +}
>> +
>> +/**
>> + * xe_survivability_mode_required- checks if survivability mode is required
>> + * @xe: xe device instance
>> + *
>> + * This function reads the boot status of Pcode capability register
>> + *
>> + * Return: true if boot status indicates failure, false otherwise
>> + */
>> +bool xe_survivability_mode_required(struct xe_device *xe)
>> +{
>> +	struct xe_survivability *survivability = &xe->survivability;
>> +	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
>> +	u32 data;
>> +
>> +	data = xe_mmio_read32(mmio, PCODE_SCRATCH_ADDR(0));
>> +	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
>> +
>> +	return (survivability->boot_status == NON_CRITICAL_FAILURE ||
>> +		survivability->boot_status == CRITICAL_FAILURE);
>> +}
>> +
>> +/**
>> + * xe_survivability_mode_remove - remove survivability mode
>> + * @xe: xe device instance
>> + *
>> + * clean up sysfs entries of survivability mode
>> + */
>> +void xe_survivability_mode_remove(struct xe_device *xe)
>> +{
>> +	struct xe_survivability *survivability = &xe->survivability;
>> +	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>> +
>> +	sysfs_remove_file(&xe->drm.dev->kobj, &dev_attr_survivability_mode.attr);
>> +	kfree(survivability->info);
>> +	pci_set_drvdata(pdev, NULL);
>> +}
>> +
>> +/**
>> + * xe_survivability_mode_init - Initialize the survivability mode
>> + * @xe: xe device instance
>> + *
>> + * Initializes the sysfs and required actions to enter survivability mode
>> + */
>> +void xe_survivability_mode_init(struct xe_device *xe)
>> +{
>> +	struct xe_survivability *survivability = &xe->survivability;
>> +	struct xe_survivability_info *info;
>> +	int ret = 0;
>> +
>> +	survivability->size = MAX_SCRATCH_MMIO;
>> +
>> +	info = kcalloc(survivability->size, sizeof(*info), GFP_KERNEL);
>> +	if (!info) {
>> +		ret = -ENOMEM;
>> +		goto err;
>> +	}
>> +
>> +	survivability->info = info;
>> +
>> +	ret = populate_survivability_info(xe);
>> +	if (ret)
>> +		goto err;
>> +
>> +	/* Only log debug information and exit if it is a critical failure */
>> +	if (survivability->boot_status == CRITICAL_FAILURE) {
>> +		log_survivability_info(xe);
>> +		kfree(survivability->info);
>> +		return;
>> +	}
>> +
>> +	enable_survivability_mode(xe);
>> +err:
>> +	if (ret)
>> +		drm_warn(&xe->drm, "%s failed, err: %d\n", __func__, ret);
> 
> same...
> 
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h b/drivers/gpu/drm/xe/xe_survivability_mode.h
>> new file mode 100644
>> index 000000000000..410e3ee5f5d1
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_survivability_mode.h
>> @@ -0,0 +1,17 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#ifndef _XE_SURVIVABILITY_MODE_H_
>> +#define _XE_SURVIVABILITY_MODE_H_
>> +
>> +#include <linux/types.h>
>> +
>> +struct xe_device;
>> +
>> +void xe_survivability_mode_init(struct xe_device *xe);
>> +void xe_survivability_mode_remove(struct xe_device *xe);
>> +bool xe_survivability_mode_required(struct xe_device *xe);
>> +
>> +#endif /* _XE_SURVIVABILITY_MODE_H_ */
>> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode_types.h b/drivers/gpu/drm/xe/xe_survivability_mode_types.h
>> new file mode 100644
>> index 000000000000..19d433e253df
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_survivability_mode_types.h
>> @@ -0,0 +1,35 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#ifndef _XE_SURVIVABILITY_MODE_TYPES_H_
>> +#define _XE_SURVIVABILITY_MODE_TYPES_H_
>> +
>> +#include <linux/limits.h>
>> +#include <linux/types.h>
>> +
>> +struct xe_survivability_info {
>> +	char name[NAME_MAX];
>> +	u32 reg;
>> +	u32 value;
>> +};
>> +
>> +/**
>> + * struct xe_survivability: Contains survivability mode information
>> + */
>> +struct xe_survivability {
>> +	/** @info: struct that holds survivability info from scratch registers */
>> +	struct xe_survivability_info *info;
>> +
>> +	/** @size: number of scratch registers */
>> +	u32 size;
>> +
>> +	/** @boot_status: indicates critical/non critical boot failure */
>> +	u8 boot_status;
>> +
>> +	/** @mode: boolean to indicate survivability mode */
>> +	bool mode;
>> +};
>> +
> 
> I believe the only blocker is the while-vs-for loop. I believe the 'drm'
> could be avoided, but not a big deal if it is really working...
> 
>> +#endif /* _XE_SURVIVABILITY_MODE_TYPES_H_ */
>> -- 
>> 2.47.1
>>


  reply	other threads:[~2025-01-15 16:18 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-01-08 10:39 [PATCH v2 0/3] Enable Survivability Mode Riana Tauro
2025-01-08 10:38 ` ✓ CI.Patch_applied: success for Enable Survivability mode (rev2) Patchwork
2025-01-08 10:38 ` ✗ CI.checkpatch: warning " Patchwork
2025-01-08 10:39 ` [PATCH v2 1/3] drm/xe: Add functions and sysfs for boot survivability Riana Tauro
2025-01-09  9:13   ` Riana Tauro
2025-01-09 10:25     ` Kulkarni, Ashwin Kumar
2025-01-10 15:21   ` Rodrigo Vivi
2025-01-15 16:17     ` Riana Tauro [this message]
2025-01-15 19:42       ` Rodrigo Vivi
2025-01-16  9:47         ` Riana Tauro
2025-01-08 10:39 ` [PATCH v2 2/3] drm/xe: Enable Boot Survivability mode Riana Tauro
2025-01-08 10:39 ` [PATCH v2 3/3] drm/xe: Initialize mei-gsc and vsec in survivability mode Riana Tauro
2025-01-10 15:26   ` Rodrigo Vivi
2025-01-16  9:52     ` Riana Tauro
2025-01-08 10:40 ` ✓ CI.KUnit: success for Enable Survivability mode (rev2) Patchwork
2025-01-08 10:58 ` ✓ CI.Build: " Patchwork
2025-01-08 11:00 ` ✓ CI.Hooks: " Patchwork
2025-01-08 11:01 ` ✓ CI.checksparse: " Patchwork
2025-01-08 11:29 ` ✓ Xe.CI.BAT: " Patchwork
2025-01-10  4:14 ` ✗ Xe.CI.Full: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bfb3420c-dc1b-4947-bfa7-2d60d6bce2db@intel.com \
    --to=riana.tauro@intel.com \
    --cc=alexander.usyskin@intel.com \
    --cc=anshuman.gupta@intel.com \
    --cc=aravind.iddamsetty@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=jani.nikula@linux.intel.com \
    --cc=matthew.d.roper@intel.com \
    --cc=rodrigo.vivi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.