Re: [PATCH v4 07/13] drm/xe/xe_ras: Add support for uncorrectable core-compute errors

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Raag Jadav <raag.jadav@intel.com>
To: Riana Tauro <riana.tauro@intel.com>
Cc: intel-xe@lists.freedesktop.org, anshuman.gupta@intel.com,
	rodrigo.vivi@intel.com, aravind.iddamsetty@linux.intel.com,
	badal.nilawar@intel.com, ravi.kishore.koppuravuri@intel.com,
	mallesh.koujalagi@intel.com, soham.purkait@intel.com
Subject: Re: [PATCH v4 07/13] drm/xe/xe_ras: Add support for uncorrectable core-compute errors
Date: Mon, 27 Apr 2026 10:24:56 +0200	[thread overview]
Message-ID: <ae8dWCGV-5XOumUH@black.igk.intel.com> (raw)
In-Reply-To: <20260417085812.4013309-22-riana.tauro@intel.com>

On Fri, Apr 17, 2026 at 02:28:19PM +0530, Riana Tauro wrote:
> Uncorrectable Core-Compute errors are classified into Global and Local
> errors.
> 
> Global error is an error that affects the entire device requiring a
> reset. This type of error is not isolated. When an AER is reported and
> error_detected is invoked return PCI_ERS_RESULT_NEED_RESET.
> 
> A Local error is confined to a specific component or context like an
> engine. These errors can be contained and recovered by resetting
> only the affected part without disrupting the rest of the device.
> 
> Upon detection of an Uncorrectable Local Core-Compute error, an AER is
> generated and GuC is notified of the error which triggers an engine reset.
> Return Recovered from PCI error callbacks for these errors.
> 
> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
> ---
> v2: add newline and fix log
>     add bounds check (Mallesh)
>     add ras specific enum (Raag)
>     helper for sysctrl prepare command
>     process all errors before deciding recovery action
> 
> v3: remove TODO from commit message
>     remove redundant rlen check
>     fix loop
>     add check for sysctrl flooding (Raag)
>     do not use xe_ras prefix for static functions (Soham)
> 
> v4: remove rlen initialization to 0
>     remove local variable
>     add error message for length mismatch (Raag)
>     reset on sysctrl flooding
>     fix sysctrl flood condition
> ---
>  drivers/gpu/drm/xe/xe_ras.c       | 133 ++++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_ras.h       |   3 +
>  drivers/gpu/drm/xe/xe_ras_types.h |  55 ++++++++++++
>  3 files changed, 191 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
> index bf780e953925..0cc7b3a6b13f 100644
> --- a/drivers/gpu/drm/xe/xe_ras.c
> +++ b/drivers/gpu/drm/xe/xe_ras.c
> @@ -5,7 +5,16 @@
>  
>  #include "xe_assert.h"
>  #include "xe_device_types.h"
> +#include "xe_printk.h"
>  #include "xe_ras.h"
> +#include "xe_ras_types.h"
> +#include "xe_sysctrl_mailbox.h"
> +#include "xe_sysctrl_mailbox_types.h"
> +
> +#define COMPUTE_ERROR_SEVERITY_MASK		GENMASK(26, 25)

I thought we used REG_ variants?

> +#define GLOBAL_UNCORR_ERROR			2
> +/* Modify as needed */
> +#define XE_SYSCTRL_ERROR_FLOOD			16
>  
>  /* Severity classification of detected errors */
>  enum xe_ras_severity {
> @@ -63,6 +72,130 @@ static inline const char *comp_to_str(struct xe_device *xe, u32 comp)
>  	return xe_ras_components[comp];
>  }
>  
> +static enum xe_ras_recovery_action handle_compute_errors(struct xe_device *xe,
> +							 struct xe_ras_error_array *arr)
> +{
> +	struct xe_ras_compute_error *error_info = (struct xe_ras_compute_error *)arr->error_details;
> +	struct xe_ras_error_common common = arr->error_class.common;
> +	u8 uncorr_type;
> +
> +	uncorr_type = FIELD_GET(COMPUTE_ERROR_SEVERITY_MASK, error_info->error_log_header);

Ditto.

> +	xe_err(xe, "[RAS]: %s %s Error detected", severity_to_str(xe, common.severity),
> +	       comp_to_str(xe, common.component));
> +
> +	/* Request a RESET if error is global */
> +	if (uncorr_type == GLOBAL_UNCORR_ERROR)
> +		return XE_RAS_RECOVERY_ACTION_RESET;
> +
> +	/* Local errors are recovered using an engine reset by GuC */
> +	return XE_RAS_RECOVERY_ACTION_RECOVERED;
> +}
> +
> +static void prepare_sysctrl_command(struct xe_sysctrl_mailbox_command *command,
> +				    u32 cmd_mask, void *request, size_t request_len,
> +				    void *response, size_t response_len)
> +{
> +	struct xe_sysctrl_app_msg_hdr hdr = {0};
> +
> +	hdr.data = FIELD_PREP(APP_HDR_GROUP_ID_MASK, XE_SYSCTRL_GROUP_GFSP) |
> +			   FIELD_PREP(APP_HDR_COMMAND_MASK, cmd_mask);

Ditto.

> +
> +	command->header = hdr;
> +	command->data_in = request;
> +	command->data_in_len = request_len;
> +	command->data_out = response;
> +	command->data_out_len = response_len;
> +}
> +
> +/**
> + * xe_ras_process_errors() - Process and contain hardware errors
> + * @xe: xe device instance
> + *
> + * Get error details from system controller and return recovery
> + * method. Called only from PCI error handling.
> + *
> + * Returns: recovery action to be taken
> + */
> +enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe)
> +{
> +	struct xe_sysctrl_mailbox_command command = {0};
> +	struct xe_ras_get_error_response response;
> +	enum xe_ras_recovery_action final_action;
> +	u32 count = 0;
> +	size_t rlen;
> +	int ret;
> +
> +	if (!xe->info.has_sysctrl)
> +		return XE_RAS_RECOVERY_ACTION_RESET;
> +
> +	/* Default action */
> +	final_action = XE_RAS_RECOVERY_ACTION_RECOVERED;
> +
> +	prepare_sysctrl_command(&command, XE_SYSCTRL_CMD_GET_SOC_ERROR, NULL, 0,
> +				&response, sizeof(response));
> +
> +	do {
> +		memset(&response, 0, sizeof(response));
> +
> +		ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen);
> +		if (ret) {
> +			xe_err(xe, "[RAS]: sysctrl: error ret %d\n", ret);

We'll likely have more users of xe_sysctrl_send_command(), so probably
worth spelling out what exactly failed.

> +			goto err;
> +		}
> +
> +		if (rlen != sizeof(response)) {
> +			xe_err(xe, "[RAS]: sysctrl: response size mismatch. Expected %zu, got %zu\n",

Ditto.

> +			       sizeof(response), rlen);
> +			goto err;
> +		}
> +
> +		/* Report if number of errors exceeds the maximum errors supported*/
> +		if (response.num_errors > XE_RAS_NUM_ERROR_ARR)
> +			xe_err(xe, "[RAS]: sysctrl: number of errors received %d out of bound (%d)\n",
> +			       response.num_errors, XE_RAS_NUM_ERROR_ARR);
> +
> +		for (int i = 0; i < response.num_errors && i < XE_RAS_NUM_ERROR_ARR; i++) {
> +			struct xe_ras_error_array arr = response.error_arr[i];

Do we need this on the stack (again)? Why not just access response memory
using a pointer?

> +			enum xe_ras_recovery_action action;
> +			struct xe_ras_error_class error_class;
> +			u8 component;
> +
> +			error_class = arr.error_class;
> +			component = error_class.common.component;
> +
> +			switch (component) {
> +			case XE_RAS_COMPONENT_CORE_COMPUTE:
> +				action = handle_compute_errors(xe, &arr);
> +				break;
> +			default:
> +				xe_err(xe, "[RAS]: Unknown error component %u\n", component);
> +				action = XE_RAS_RECOVERY_ACTION_RESET;
> +				break;
> +			}
> +
> +			/*
> +			 * Retain the highest severity action. Process and log all errors
> +			 * and then take appropriate recovery action.
> +			 */
> +			if (action > final_action)
> +				final_action = action;
> +		}
> +
> +		/* Treat flooding as a system controller error */
> +		if (++count >= XE_SYSCTRL_ERROR_FLOOD) {

Nit: I know this does the job, but I'd make the logic consistent with
while(), i.e. start from flood value and return on count == 0.

> +			xe_err(xe, "[RAS]: sysctrl: response flooding\n");
> +			return XE_RAS_RECOVERY_ACTION_RESET;
> +		}
> +
> +	} while (response.additional_errors);
> +
> +	return final_action;
> +
> +err:
> +	return XE_RAS_RECOVERY_ACTION_RESET;
> +}
> +
>  #ifdef CONFIG_PCIEAER
>  static void aer_unmask_and_downgrade_internal_error(struct xe_device *xe)
>  {
> diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
> index 14cb973603e7..e191ab80080c 100644
> --- a/drivers/gpu/drm/xe/xe_ras.h
> +++ b/drivers/gpu/drm/xe/xe_ras.h
> @@ -6,8 +6,11 @@
>  #ifndef _XE_RAS_H_
>  #define _XE_RAS_H_
>  
> +#include "xe_ras_types.h"
> +
>  struct xe_device;
>  
>  void xe_ras_init(struct xe_device *xe);
> +enum xe_ras_recovery_action  xe_ras_process_errors(struct xe_device *xe);

Extra whitespace.

>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_ras_types.h b/drivers/gpu/drm/xe/xe_ras_types.h
> index 3d0f9e94a404..8bebe958b8e8 100644
> --- a/drivers/gpu/drm/xe/xe_ras_types.h
> +++ b/drivers/gpu/drm/xe/xe_ras_types.h
> @@ -11,6 +11,24 @@
>  #define XE_RAS_NUM_ERROR_ARR		3
>  #define XE_RAS_MAX_ERROR_DETAILS	16
>  
> +/**
> + * enum xe_ras_recovery_action - RAS recovery actions
> + *
> + * @XE_RAS_RECOVERY_ACTION_RECOVERED: Error recovered
> + * @XE_RAS_RECOVERY_ACTION_RESET: Requires reset
> + * @XE_RAS_RECOVERY_ACTION_DISCONNECT: Requires disconnect
> + * @XE_RAS_RECOVERY_ACTION_MAX: Max action value
> + *
> + * This enum defines the possible recovery actions that can be taken in response
> + * to RAS errors.
> + */
> +enum xe_ras_recovery_action {
> +	XE_RAS_RECOVERY_ACTION_RECOVERED = 0,
> +	XE_RAS_RECOVERY_ACTION_RESET,
> +	XE_RAS_RECOVERY_ACTION_DISCONNECT,
> +	XE_RAS_RECOVERY_ACTION_MAX
> +};
> +
>  /**
>   * struct xe_ras_error_common - Common RAS error class
>   *
> @@ -93,4 +111,41 @@ struct xe_ras_get_error_response {
>  	struct xe_ras_error_array error_arr[XE_RAS_NUM_ERROR_ARR];
>  } __packed;
>  
> +/**
> + * struct xe_ras_compute_error - Error details of Core Compute error
> + */
> +struct xe_ras_compute_error {
> +	/** @error_log_header: Error Source and type */
> +	u32 error_log_header;
> +	/** @internal_error_log: Internal Error log */
> +	u32 internal_error_log;
> +	/** @fabric_log: Fabric Error log */
> +	u32 fabric_log;
> +	/** @internal_error_addr_log0: Internal Error addr log */
> +	u32 internal_error_addr_log0;
> +	/** @internal_error_addr_log1: Internal Error addr log */
> +	u32 internal_error_addr_log1;
> +	/** @packet_log0: Packet log */
> +	u32 packet_log0;
> +	/** @packet_log1: Packet log */
> +	u32 packet_log1;
> +	/** @packet_log2: Packet log */
> +	u32 packet_log2;
> +	/** @packet_log3: Packet log */
> +	u32 packet_log3;
> +	/** @packet_log4: Packet log */
> +	u32 packet_log4;
> +	/** @misc_log0: Misc log */
> +	u32 misc_log0;
> +	/** @misc_log1: Misc log */
> +	u32 misc_log1;
> +	/** @spare_log0: Spare log */
> +	u32 spare_log0;
> +	/** @spare_log1: Spare log */
> +	u32 spare_log1;
> +	/** @spare_log2: Spare log */
> +	u32 spare_log2;
> +	/** @spare_log3: Spare log */
> +	u32 spare_log3;
> +} __packed;

Unless we use the fields in code, let's mark it as reserved.

Also curious, should we be using these for CPER or should we be relying
on the info queue?

Raag

>  #endif
> -- 
> 2.47.1
>

next prev parent reply	other threads:[~2026-04-27  8:25 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-17  8:58 [PATCH v4 00/13] Introduce Xe Uncorrectable Error Handling Riana Tauro
2026-04-17  8:58 ` [PATCH v4 01/13] drm/xe/xe_survivability: Decouple survivability info from boot survivability Riana Tauro
2026-04-17  8:58 ` [PATCH v4 02/13] drm/xe/xe_pci_error: Implement PCI error recovery callbacks Riana Tauro
2026-04-27  6:35   ` Raag Jadav
2026-05-06 13:59     ` Tauro, Riana
2026-04-17  8:58 ` [PATCH v4 03/13] drm/xe/xe_pci_error: Group all devres to release them on PCIe slot reset Riana Tauro
2026-04-17  8:58 ` [PATCH v4 04/13] drm/xe: Skip device access during PCI error recovery Riana Tauro
2026-04-30 12:58   ` Anshuman Gupta
2026-05-06 15:41     ` Tauro, Riana
2026-04-17  8:58 ` [PATCH v4 05/13] drm/xe/xe_ras: Initialize Uncorrectable AER Registers Riana Tauro
2026-04-27  7:56   ` Raag Jadav
2026-05-05  5:22     ` Tauro, Riana
2026-04-17  8:58 ` [PATCH v4 06/13] drm/xe/xe_ras: Add basic structures and commands for uncorrectable errors Riana Tauro
2026-04-17 17:38   ` Matt Roper
2026-04-17 21:25     ` Jadav, Raag
2026-04-17 21:32       ` Matt Roper
2026-04-20  5:34         ` Tauro, Riana
2026-04-20  7:49           ` Raag Jadav
2026-04-17  8:58 ` [PATCH v4 07/13] drm/xe/xe_ras: Add support for uncorrectable core-compute errors Riana Tauro
2026-04-27  8:24   ` Raag Jadav [this message]
2026-05-05  5:28     ` Tauro, Riana
2026-04-17  8:58 ` [PATCH v4 08/13] drm/xe/xe_ras: Handle uncorrectable SoC Internal errors Riana Tauro
2026-04-17  8:58 ` [PATCH v4 09/13] drm/xe/xe_ras: Handle uncorrectable device memory errors Riana Tauro
2026-04-21  6:08   ` Upadhyay, Tejas
2026-05-05  5:03     ` Tauro, Riana
2026-04-17  8:58 ` [PATCH v4 10/13] drm/xe/xe_ras: Add support to offline/decline a page Riana Tauro
2026-04-21  6:21   ` Upadhyay, Tejas
2026-05-05  5:16     ` Tauro, Riana
2026-04-17  8:58 ` [PATCH v4 11/13] drm/xe/xe_ras: Add support for page offline list and queue commands Riana Tauro
2026-04-21  6:19   ` Upadhyay, Tejas
2026-05-05  5:08     ` Tauro, Riana
2026-04-21  9:10   ` Upadhyay, Tejas
2026-05-05  5:17     ` Tauro, Riana
2026-04-17  8:58 ` [PATCH v4 12/13] drm/xe/xe_ras: Query errors from system controller on probe Riana Tauro
2026-04-28 11:46   ` Raag Jadav
2026-05-05 13:50     ` Tauro, Riana
2026-04-17  8:58 ` [PATCH v4 13/13] drm/xe/xe_pci_error: Process errors in mmio_enabled Riana Tauro
2026-04-28 11:39   ` Raag Jadav
2026-05-05  5:31     ` Tauro, Riana
2026-04-30 11:15   ` Gupta, Anshuman
2026-05-02 17:55     ` Raag Jadav
2026-04-20 13:33 ` ✗ CI.checkpatch: warning for Introduce Xe Uncorrectable Error Handling (rev4) Patchwork
2026-04-20 13:35 ` ✓ CI.KUnit: success " Patchwork
2026-04-20 14:42 ` ✓ Xe.CI.BAT: " Patchwork
2026-04-20 17:14 ` ✗ Xe.CI.FULL: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ae8dWCGV-5XOumUH@black.igk.intel.com \
    --to=raag.jadav@intel.com \
    --cc=anshuman.gupta@intel.com \
    --cc=aravind.iddamsetty@linux.intel.com \
    --cc=badal.nilawar@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=mallesh.koujalagi@intel.com \
    --cc=ravi.kishore.koppuravuri@intel.com \
    --cc=riana.tauro@intel.com \
    --cc=rodrigo.vivi@intel.com \
    --cc=soham.purkait@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.