Hi Riana,
Uncorrectable errors from different endpoints in the device are steered to
the USP which is a PCI Advanced Error Reporting (AER) Compliant device.
Downgrade all the errors to non-fatal to prevent PCIe bus driver
from triggering a Secondary Bus Reset (SBR). This allows error
detection, containment and recovery in the driver.
The Uncorrectable Error Severity Register has the 'Uncorrectable
Internal Error Severity' set to fatal by default. Set this to
non-fatal and unmask the error.
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
---
drivers/gpu/drm/xe/Makefile | 1 +
drivers/gpu/drm/xe/xe_device.c | 3 ++
drivers/gpu/drm/xe/xe_ras.c | 71 ++++++++++++++++++++++++++++++++++
drivers/gpu/drm/xe/xe_ras.h | 13 +++++++
4 files changed, 88 insertions(+)
create mode 100644 drivers/gpu/drm/xe/xe_ras.c
create mode 100644 drivers/gpu/drm/xe/xe_ras.h
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index 5581f2180b5c..85ec53eb0b62 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -110,6 +110,7 @@ xe-y += xe_bb.o \
xe_pxp_debugfs.o \
xe_pxp_submit.o \
xe_query.o \
+ xe_ras.o \
xe_range_fence.o \
xe_reg_sr.o \
xe_reg_whitelist.o \
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index f418ebf04f0f..be89ffc9eade 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -59,6 +59,7 @@
#include "xe_psmi.h"
#include "xe_pxp.h"
#include "xe_query.h"
+#include "xe_ras.h"
#include "xe_shrinker.h"
#include "xe_soc_remapper.h"
#include "xe_survivability_mode.h"
@@ -1019,6 +1020,8 @@ int xe_device_probe(struct xe_device *xe)
xe_vsec_init(xe);
+ xe_ras_init(xe);
+
err = xe_sriov_init_late(xe);
if (err)
goto err_unregister_display;
diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
new file mode 100644
index 000000000000..ba5ed37aed28
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_ras.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+#include <linux/pci.h>
+
+#include "xe_device_types.h"
+#include "xe_ras.h"
+
+#ifdef CONFIG_PCIEAER
+static void unmask_and_downgrade_internal_error(struct xe_device *xe)
+{
+ struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+ struct pci_dev *vsp, *usp;
+ u32 aer_uncorr_sev, aer_uncorr_mask;
+ u16 aer_cap;
+
+ /* Gfx Device Hierarchy: USP-->VSP-->SGunit */
+ vsp = pci_upstream_bridge(pdev);
+ if (!vsp)
+ return;
+
+ usp = pci_upstream_bridge(vsp);
+ if (!usp)
+ return;
+
+ aer_cap = usp->aer_cap;
+
+ if (!aer_cap)
+ return;
+
+ /*
+ * All errors are steered to USP which is a PCIe AER Compliant device.
+ * Downgrade all the errors to non-fatal to prevent PCIe bus driver
+ * from triggering a Secondary Bus Reset (SBR). This allows error
+ * detection, containment and recovery in the driver.
+ *
+ * The Uncorrectable Error Severity Register has the 'Uncorrectable
+ * Internal Error Severity' set to fatal by default. Set this to
+ * non-fatal and unmask the error.
+ */
+
Before unmasking the PCI_ERR_UNC_INTN bit, we should clear any stale event in the PCI_ERR_UNCOR_STATUS register, since it would otherwise be signaled as soon as we unmask the bit (assuming the bit wasn't already unmasked).
There is a pci_aer_unmask_internal_errors() helper defined in drivers/pci/pcie/aer.c, which we could probably use by exporting it.
Also, do you think it would make more sense to move this to PCI quirks? In a virtualized environment, the XeKMD might be in a VM (passthrough model) while the USP is in the host, in which case this might not work.
+ /* Initialize Uncorrectable Error Severity Register */
+ pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, &aer_uncorr_sev);
+ aer_uncorr_sev &= ~PCI_ERR_UNC_INTN;
+ pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, aer_uncorr_sev);
+
+ /* Initialize Uncorrectable Error Mask Register */
+ pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, &aer_uncorr_mask);
+ aer_uncorr_mask &= ~PCI_ERR_UNC_INTN;
+ pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, aer_uncorr_mask);
+
+ pci_save_state(usp);
+}
+#endif
+
+/**
+ * xe_ras_init - Initialize Xe RAS
+ * @xe: xe device instance
+ *
+ * No-op without has_sysctrl; with CONFIG_PCIEAER, configures AER on the USP.
+ */
+void xe_ras_init(struct xe_device *xe)
+{
+ if (!xe->info.has_sysctrl)
+ return;
+
+#ifdef CONFIG_PCIEAER
+ unmask_and_downgrade_internal_error(xe);
+#endif
+}
diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
new file mode 100644
index 000000000000..14cb973603e7
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_ras.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#ifndef _XE_RAS_H_
+#define _XE_RAS_H_
+
+struct xe_device;
+
+void xe_ras_init(struct xe_device *xe);
+
+#endif
Thanks,