public inbox for intel-xe@lists.freedesktop.org
 help / color / mirror / Atom feed
From: Raag Jadav <raag.jadav@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: matthew.brost@intel.com, rodrigo.vivi@intel.com,
	thomas.hellstrom@linux.intel.com, riana.tauro@intel.com,
	michal.wajdeczko@intel.com, matthew.d.roper@intel.com,
	michal.winiarski@intel.com, matthew.auld@intel.com,
	maarten@lankhorst.se, jani.nikula@intel.com,
	lukasz.laguna@intel.com, zhanjun.dong@intel.com, lukas@wunner.de,
	Raag Jadav <raag.jadav@intel.com>
Subject: [PATCH v5 9/9] drm/xe/pci: Introduce PCIe FLR
Date: Mon,  6 Apr 2026 19:37:22 +0530	[thread overview]
Message-ID: <20260406140722.154445-10-raag.jadav@intel.com> (raw)
In-Reply-To: <20260406140722.154445-1-raag.jadav@intel.com>

With the bare minimum pieces in place, we can finally introduce PCIe Function
Level Reset (FLR) handling, which re-initializes hardware state without the
need to reload the driver from userspace. All VRAM contents are lost
along with hardware state; the driver takes care of recreating the required
kernel BOs as part of re-initialization, but the user still needs to recreate
user BOs and reload contexts after PCIe FLR.

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
---
v2: Spell out Function Level Reset (Jani)
v5: Prevent PM ref leak for wedged device (Matthew Brost)
---
 drivers/gpu/drm/xe/Makefile          |   1 +
 drivers/gpu/drm/xe/xe_device_types.h |   3 +
 drivers/gpu/drm/xe/xe_pci.c          |   1 +
 drivers/gpu/drm/xe/xe_pci.h          |   2 +
 drivers/gpu/drm/xe/xe_pci_err.c      | 160 +++++++++++++++++++++++++++
 5 files changed, 167 insertions(+)
 create mode 100644 drivers/gpu/drm/xe/xe_pci_err.c

diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index f9abaf687d46..06b5d53e1629 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -100,6 +100,7 @@ xe-y += xe_bb.o \
 	xe_page_reclaim.o \
 	xe_pat.o \
 	xe_pci.o \
+	xe_pci_err.o \
 	xe_pci_rebar.o \
 	xe_pcode.o \
 	xe_pm.o \
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 150c76b2acaf..b743b3986205 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -482,6 +482,9 @@ struct xe_device {
 	/** @needs_flr_on_fini: requests function-reset on fini */
 	bool needs_flr_on_fini;
 
+	/** @flr_prepared: Prepared for function-reset */
+	bool flr_prepared;
+
 	/** @wedged: Struct to control Wedged States and mode */
 	struct {
 		/** @wedged.flag: Xe device faced a critical error and is now blocked. */
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 26eb58e11056..f3515c91e534 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -1332,6 +1332,7 @@ static struct pci_driver xe_pci_driver = {
 #ifdef CONFIG_PM_SLEEP
 	.driver.pm = &xe_pm_ops,
 #endif
+	.err_handler = &xe_pci_err_handlers,
 };
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_pci.h b/drivers/gpu/drm/xe/xe_pci.h
index 11bcc5fe2c5b..85e85e8508c3 100644
--- a/drivers/gpu/drm/xe/xe_pci.h
+++ b/drivers/gpu/drm/xe/xe_pci.h
@@ -8,6 +8,8 @@
 
 struct pci_dev;
 
+extern const struct pci_error_handlers xe_pci_err_handlers;
+
 int xe_register_pci_driver(void);
 void xe_unregister_pci_driver(void);
 struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev);
diff --git a/drivers/gpu/drm/xe/xe_pci_err.c b/drivers/gpu/drm/xe/xe_pci_err.c
new file mode 100644
index 000000000000..339e8688d37f
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_pci_err.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#include "xe_bo_evict.h"
+#include "xe_device.h"
+#include "xe_gt.h"
+#include "xe_gt_idle.h"
+#include "xe_i2c.h"
+#include "xe_irq.h"
+#include "xe_late_bind_fw.h"
+#include "xe_pci.h"
+#include "xe_pcode.h"
+#include "xe_printk.h"
+#include "xe_pxp.h"
+#include "xe_wa.h"
+
+/* TODO: Extend support as a follow-up. Expands 'xe' and 'pdev' from the caller's scope. */
+#define XE_FLR_SKIP		(!IS_DGFX(xe) || IS_SRIOV_VF(xe) || pci_num_vf(pdev) || \
+				 xe->info.probe_display)
+
+static int xe_flr_prepare(struct xe_device *xe)
+{
+	struct xe_gt *gt;
+	int err;
+	u8 id;
+
+	err = xe_pxp_pm_suspend(xe->pxp);
+	if (err)
+		return err;	/* the only step here whose result is checked */
+
+	xe_late_bind_wait_for_worker_completion(&xe->late_bind);
+
+	xe_irq_disable(xe);
+
+	for_each_gt(gt, xe, id)
+		xe_gt_flr_prepare(gt);
+
+	/* TODO: Drop all user BOs */
+	xe_bo_pci_dev_remove_pinned(xe);
+	unmap_mapping_range(xe->drm.anon_inode->i_mapping, 0, 0, 1);
+
+	return 0;
+}
+
+static int xe_flr_done(struct xe_device *xe)
+{
+	struct xe_tile *tile;
+	struct xe_gt *gt;
+	int err;
+	u8 id;	/* reused by both the GT and the tile iterators below */
+
+	for_each_gt(gt, xe, id)
+		xe_gt_idle_disable_c6(gt);
+
+	for_each_tile(tile, xe, id)
+		xe_wa_apply_tile_workarounds(tile);
+
+	err = xe_pcode_ready(xe, true);
+	if (err)
+		return err;	/* bail out; the remaining steps are skipped */
+
+	xe_device_assert_lmem_ready(xe);
+
+	err = xe_bo_restore_map(xe);
+	if (err)
+		return err;
+
+	for_each_gt(gt, xe, id) {
+		err = xe_gt_flr_done(gt);
+		if (err)
+			return err;
+	}
+
+	xe_i2c_pm_resume(xe, true);
+
+	xe_irq_resume(xe);
+
+	for_each_gt(gt, xe, id) {
+		err = xe_gt_resume(gt);
+		if (err)
+			return err;
+	}
+
+	xe_pxp_pm_resume(xe->pxp);
+
+	xe_late_bind_fw_load(&xe->late_bind);
+
+	return 0;
+}
+
+static void xe_pci_reset_prepare(struct pci_dev *pdev)
+{
+	struct xe_device *xe = pdev_to_xe_device(pdev);
+
+	if (XE_FLR_SKIP) {
+		xe_err(xe, "PCIe FLR not supported\n");
+		return;
+	}
+
+	if (xe_device_wedged(xe)) {
+		xe_err(xe, "PCIe FLR aborted, device in unexpected state\n");
+		return;	/* void callback: this only skips our preparation */
+	}
+
+	/* Wedge the device to prevent userspace access but don't send the event yet */
+	atomic_set(&xe->wedged.flag, 1);
+
+	/*
+	 * The hardware could be in a corrupted state and access unreliable, but we try to
+	 * update data structures and clean up any pending work to avoid side effects during
+	 * PCIe FLR. This will be similar to the xe_pm_suspend() flow but without migration.
+	 */
+	if (xe_flr_prepare(xe)) {
+		xe_err(xe, "Failed to prepare for PCIe FLR\n");
+		return;	/* wedged.flag stays set and flr_prepared is not set */
+	}
+
+	xe->flr_prepared = true;
+	xe_info(xe, "Prepared for PCIe FLR\n");
+}
+
+static void xe_pci_reset_done(struct pci_dev *pdev)
+{
+	struct xe_device *xe = pdev_to_xe_device(pdev);
+
+	if (XE_FLR_SKIP)
+		return;
+
+	if (!xe_device_wedged(xe) || !xe->flr_prepared)
+		return;
+
+	/* Clear the prepared state up front so a failed re-init is not attempted again */
+	xe->flr_prepared = false;
+
+	/*
+	 * We already have the data structures intact, so try to re-initialize the device.
+	 * This will be similar to the xe_pm_resume() flow, except we'll also need to recreate
+	 * all VRAM contents.
+	 */
+	if (xe_flr_done(xe)) {
+		xe_err(xe, "Re-initialization failed\n");
+		return;	/* wedged.flag is left set */
+	}
+
+	/* Unwedge to allow userspace access */
+	atomic_set(&xe->wedged.flag, 0);
+
+	xe_info(xe, "Re-initialization success\n");
+}
+
+/*
+ * Only the PCIe Function Level Reset (FLR) callbacks, reset_prepare and reset_done, are set.
+ * TODO: Add PCIe error handlers using a similar flow.
+ */
+const struct pci_error_handlers xe_pci_err_handlers = {
+	.reset_prepare = xe_pci_reset_prepare,
+	.reset_done = xe_pci_reset_done,
+};
-- 
2.43.0


  parent reply	other threads:[~2026-04-06 14:12 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-06 14:07 [PATCH v5 0/9] Introduce Xe PCIe FLR Raag Jadav
2026-04-06 14:07 ` [PATCH v5 1/9] drm/xe/uc_fw: Allow re-initializing firmware Raag Jadav
2026-04-15 16:06   ` Daniele Ceraolo Spurio
2026-04-06 14:07 ` [PATCH v5 2/9] drm/xe/guc_submit: Introduce guc_exec_queue_reinit() Raag Jadav
2026-04-06 14:07 ` [PATCH v5 3/9] drm/xe/gt: Introduce FLR helpers Raag Jadav
2026-04-15 16:25   ` Daniele Ceraolo Spurio
2026-04-06 14:07 ` [PATCH v5 4/9] drm/xe/irq: Introduce xe_irq_disable() Raag Jadav
2026-04-06 14:07 ` [PATCH v5 5/9] drm/xe: Introduce xe_device_assert_lmem_ready() Raag Jadav
2026-04-06 14:07 ` [PATCH v5 6/9] drm/xe/bo_evict: Introduce xe_bo_restore_map() Raag Jadav
2026-04-06 14:07 ` [PATCH v5 7/9] drm/xe/exec_queue: Introduce xe_exec_queue_reinit() Raag Jadav
2026-04-15 16:10   ` Daniele Ceraolo Spurio
2026-04-15 16:48     ` Daniele Ceraolo Spurio
2026-04-15 17:02       ` Daniele Ceraolo Spurio
2026-04-06 14:07 ` [PATCH v5 8/9] drm/xe/migrate: Introduce xe_migrate_reinit() Raag Jadav
2026-04-06 14:07 ` Raag Jadav [this message]
2026-04-15  8:43   ` [PATCH v5 9/9] drm/xe/pci: Introduce PCIe FLR Laguna, Lukasz
2026-04-15  9:46     ` Raag Jadav
2026-04-15 10:33       ` Laguna, Lukasz
2026-04-15 10:54         ` Raag Jadav
2026-04-16  6:40           ` Raag Jadav
2026-04-17  7:10             ` Laguna, Lukasz
2026-04-15 16:45   ` Daniele Ceraolo Spurio
2026-04-06 14:18 ` ✗ CI.checkpatch: warning for Introduce Xe PCIe FLR (rev5) Patchwork
2026-04-06 14:19 ` ✓ CI.KUnit: success " Patchwork
2026-04-06 14:54 ` ✓ Xe.CI.BAT: " Patchwork
2026-04-06 18:08 ` ✗ Xe.CI.FULL: failure " Patchwork
2026-04-10 14:22 ` [PATCH v5 0/9] Introduce Xe PCIe FLR Raag Jadav
2026-04-10 18:22   ` Maarten Lankhorst
2026-04-11  8:11     ` Raag Jadav
2026-04-15 15:47 ` Daniele Ceraolo Spurio
2026-04-16  6:19   ` Raag Jadav
2026-04-16  6:35     ` Matthew Brost

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260406140722.154445-10-raag.jadav@intel.com \
    --to=raag.jadav@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=jani.nikula@intel.com \
    --cc=lukas@wunner.de \
    --cc=lukasz.laguna@intel.com \
    --cc=maarten@lankhorst.se \
    --cc=matthew.auld@intel.com \
    --cc=matthew.brost@intel.com \
    --cc=matthew.d.roper@intel.com \
    --cc=michal.wajdeczko@intel.com \
    --cc=michal.winiarski@intel.com \
    --cc=riana.tauro@intel.com \
    --cc=rodrigo.vivi@intel.com \
    --cc=thomas.hellstrom@linux.intel.com \
    --cc=zhanjun.dong@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox