Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/xe: Introduce a simple wedged state
@ 2024-04-09 22:15 Rodrigo Vivi
  2024-04-09 22:15 ` [PATCH 2/4] drm/xe: declare wedged upon GuC load failure Rodrigo Vivi
                   ` (12 more replies)
  0 siblings, 13 replies; 36+ messages in thread
From: Rodrigo Vivi @ 2024-04-09 22:15 UTC (permalink / raw)
  To: intel-xe
  Cc: Rodrigo Vivi, Ashutosh Dixit, Tvrtko Ursulin,
	Thomas Hellström, Lucas De Marchi, Anshuman Gupta,
	Himal Prasad Ghimiray

Introduce a very simple 'wedged' state where any attempt
to access the GPU is entirely blocked.

On some critical cases, like on gt_reset failure, we need to
block any other attempt to use the GPU. Otherwise we are at
a risk of reaching cases that would force us to reboot the machine.

So, when this cases are identified we corner and block any GPU
access. No IOCTL and not even another GT reset should be attempted.

The 'wedged' state in Xe is an end state with no way back.
Only a device "re-probe" (unbind + bind) can restore the GPU access.

v2: - s/wedged/busted (Lucas)
    - use unbind+bind instead of module reload (Lucas)
    - added more info on unbind operations and instruction on bug report
    - only print the message once.

v3: - s/busted/wedged (Ashutosh, Tvrtko, Thomas)
    - don't assume user has sudo and tee available (Lucas)

v4: - remove unnecessary cases around ct communication or migration.

Cc: Ashutosh Dixit <ashutosh.dixit@intel.com>
Cc: Tvrtko Ursulin <tursulin@ursulin.net>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Anshuman Gupta <anshuman.gupta@intel.com>
Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com> #v2
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com> #v2
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c       |  6 ++++++
 drivers/gpu/drm/xe/xe_device.h       | 20 ++++++++++++++++++++
 drivers/gpu/drm/xe/xe_device_types.h |  3 +++
 drivers/gpu/drm/xe/xe_gt.c           |  5 ++++-
 drivers/gpu/drm/xe/xe_guc_pc.c       |  3 +++
 5 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 9083f5e02dd9..67de795e43b3 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -142,6 +142,9 @@ static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct xe_device *xe = to_xe_device(file_priv->minor->dev);
 	long ret;
 
+	if (xe_device_wedged(xe))
+		return -ECANCELED;
+
 	ret = xe_pm_runtime_get_ioctl(xe);
 	if (ret >= 0)
 		ret = drm_ioctl(file, cmd, arg);
@@ -157,6 +160,9 @@ static long xe_drm_compat_ioctl(struct file *file, unsigned int cmd, unsigned lo
 	struct xe_device *xe = to_xe_device(file_priv->minor->dev);
 	long ret;
 
+	if (xe_device_wedged(xe))
+		return -ECANCELED;
+
 	ret = xe_pm_runtime_get_ioctl(xe);
 	if (ret >= 0)
 		ret = drm_compat_ioctl(file, cmd, arg);
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index d413bc2c6be5..c532209c5bbd 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -176,4 +176,24 @@ void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
 u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address);
 u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);
 
+static inline bool xe_device_wedged(struct xe_device *xe)
+{
+	return atomic_read(&xe->wedged);
+}
+
+static inline void xe_device_declare_wedged(struct xe_device *xe)
+{
+	if (!atomic_xchg(&xe->wedged, 1)) {
+		xe->needs_flr_on_fini = true;
+		drm_err(&xe->drm,
+			"CRITICAL: Xe has declared device %s as wedged.\n"
+			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
+			"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
+			"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
+			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
+			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
+			dev_name(xe->drm.dev));
+	}
+}
+
 #endif
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index faa32407efa5..b9ef60f21750 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -458,6 +458,9 @@ struct xe_device {
 	/** @needs_flr_on_fini: requests function-reset on fini */
 	bool needs_flr_on_fini;
 
+	/** @wedged: Xe device faced a critical error and is now blocked. */
+	atomic_t wedged;
+
 	/* private: */
 
 #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index cfa5da900461..0844081b88ef 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -633,6 +633,9 @@ static int gt_reset(struct xe_gt *gt)
 {
 	int err;
 
+	if (xe_device_wedged(gt_to_xe(gt)))
+		return -ECANCELED;
+
 	/* We only support GT resets with GuC submission */
 	if (!xe_device_uc_enabled(gt_to_xe(gt)))
 		return -ENODEV;
@@ -685,7 +688,7 @@ static int gt_reset(struct xe_gt *gt)
 err_fail:
 	xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
 
-	gt_to_xe(gt)->needs_flr_on_fini = true;
+	xe_device_declare_wedged(gt_to_xe(gt));
 
 	return err;
 }
diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c
index 521ae24f2314..f4663f1b0a80 100644
--- a/drivers/gpu/drm/xe/xe_guc_pc.c
+++ b/drivers/gpu/drm/xe/xe_guc_pc.c
@@ -902,6 +902,9 @@ static void xe_guc_pc_fini(struct drm_device *drm, void *arg)
 		return;
 	}
 
+	if (xe_device_wedged(xe))
+		return;
+
 	XE_WARN_ON(xe_force_wake_get(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL));
 	XE_WARN_ON(xe_guc_pc_gucrc_disable(pc));
 	XE_WARN_ON(xe_guc_pc_stop(pc));
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread
* [PATCH 1/4] drm/xe: Introduce a simple wedged state
@ 2024-04-23 22:18 Rodrigo Vivi
  2024-04-23 22:18 ` [PATCH 3/4] drm/xe: Force wedged state and block GT reset upon any GPU hang Rodrigo Vivi
  0 siblings, 1 reply; 36+ messages in thread
From: Rodrigo Vivi @ 2024-04-23 22:18 UTC (permalink / raw)
  To: intel-xe
  Cc: Rodrigo Vivi, Ashutosh Dixit, Tvrtko Ursulin,
	Thomas Hellström, Lucas De Marchi, Anshuman Gupta,
	Himal Prasad Ghimiray

Introduce a very simple 'wedged' state where any attempt
to access the GPU is entirely blocked.

On some critical cases, like on gt_reset failure, we need to
block any other attempt to use the GPU. Otherwise we are at
a risk of reaching cases that would force us to reboot the machine.

So, when this cases are identified we corner and block any GPU
access. No IOCTL and not even another GT reset should be attempted.

The 'wedged' state in Xe is an end state with no way back.
Only a device "re-probe" (unbind + bind) can restore the GPU access.

v2: - s/wedged/busted (Lucas)
    - use unbind+bind instead of module reload (Lucas)
    - added more info on unbind operations and instruction on bug report
    - only print the message once.

v3: - s/busted/wedged (Ashutosh, Tvrtko, Thomas)
    - don't assume user has sudo and tee available (Lucas)

v4: - remove unnecessary cases around ct communication or migration.

Cc: Ashutosh Dixit <ashutosh.dixit@intel.com>
Cc: Tvrtko Ursulin <tursulin@ursulin.net>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Anshuman Gupta <anshuman.gupta@intel.com>
Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com> #v2
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c       |  6 ++++++
 drivers/gpu/drm/xe/xe_device.h       | 20 ++++++++++++++++++++
 drivers/gpu/drm/xe/xe_device_types.h |  3 +++
 drivers/gpu/drm/xe/xe_gt.c           |  5 ++++-
 drivers/gpu/drm/xe/xe_guc_pc.c       |  3 +++
 5 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 55bbc8b8df15..76a7b37a4a53 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -137,6 +137,9 @@ static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct xe_device *xe = to_xe_device(file_priv->minor->dev);
 	long ret;
 
+	if (xe_device_wedged(xe))
+		return -ECANCELED;
+
 	ret = xe_pm_runtime_get_ioctl(xe);
 	if (ret >= 0)
 		ret = drm_ioctl(file, cmd, arg);
@@ -152,6 +155,9 @@ static long xe_drm_compat_ioctl(struct file *file, unsigned int cmd, unsigned lo
 	struct xe_device *xe = to_xe_device(file_priv->minor->dev);
 	long ret;
 
+	if (xe_device_wedged(xe))
+		return -ECANCELED;
+
 	ret = xe_pm_runtime_get_ioctl(xe);
 	if (ret >= 0)
 		ret = drm_compat_ioctl(file, cmd, arg);
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index 36d4434ebccc..d2e4249d37ce 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -167,4 +167,24 @@ void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
 u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address);
 u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);
 
+static inline bool xe_device_wedged(struct xe_device *xe)
+{
+	return atomic_read(&xe->wedged);
+}
+
+static inline void xe_device_declare_wedged(struct xe_device *xe)
+{
+	if (!atomic_xchg(&xe->wedged, 1)) {
+		xe->needs_flr_on_fini = true;
+		drm_err(&xe->drm,
+			"CRITICAL: Xe has declared device %s as wedged.\n"
+			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
+			"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
+			"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
+			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
+			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
+			dev_name(xe->drm.dev));
+	}
+}
+
 #endif
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 2e62450d86e1..9b0f3ddc6d50 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -459,6 +459,9 @@ struct xe_device {
 	/** @needs_flr_on_fini: requests function-reset on fini */
 	bool needs_flr_on_fini;
 
+	/** @wedged: Xe device faced a critical error and is now blocked. */
+	atomic_t wedged;
+
 	/* private: */
 
 #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 491d0413de15..e922e77f5010 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -633,6 +633,9 @@ static int gt_reset(struct xe_gt *gt)
 {
 	int err;
 
+	if (xe_device_wedged(gt_to_xe(gt)))
+		return -ECANCELED;
+
 	/* We only support GT resets with GuC submission */
 	if (!xe_device_uc_enabled(gt_to_xe(gt)))
 		return -ENODEV;
@@ -685,7 +688,7 @@ static int gt_reset(struct xe_gt *gt)
 err_fail:
 	xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
 
-	gt_to_xe(gt)->needs_flr_on_fini = true;
+	xe_device_declare_wedged(gt_to_xe(gt));
 
 	return err;
 }
diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c
index 509649d0e65e..8fc757900ed1 100644
--- a/drivers/gpu/drm/xe/xe_guc_pc.c
+++ b/drivers/gpu/drm/xe/xe_guc_pc.c
@@ -902,6 +902,9 @@ static void xe_guc_pc_fini(struct drm_device *drm, void *arg)
 		return;
 	}
 
+	if (xe_device_wedged(xe))
+		return;
+
 	XE_WARN_ON(xe_force_wake_get(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL));
 	XE_WARN_ON(xe_guc_pc_gucrc_disable(pc));
 	XE_WARN_ON(xe_guc_pc_stop(pc));
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread
* [PATCH 0/4] Introduce a wedged state
@ 2024-04-03 15:07 Rodrigo Vivi
  2024-04-03 15:07 ` [PATCH 3/4] drm/xe: Force wedged state and block GT reset upon any GPU hang Rodrigo Vivi
  0 siblings, 1 reply; 36+ messages in thread
From: Rodrigo Vivi @ 2024-04-03 15:07 UTC (permalink / raw)
  To: intel-xe; +Cc: matthew.brost, lucas.demarchi, Rodrigo Vivi

Let's introduce a wedged state in Xe.

First of all, we need some more robust protection here.
One of the current biggest pain developers are facing with
Xe is that if something goes wrong and gt-reset cannot solve
the issue or when guc load has failed, we would like to
stay in a system where we can debug and reload the module,
instead of getting forced to reboot the system and start over.

The first patch of this series is enough for this protection,
and it is ready for that.

The next reasoning for this series is that in many cases,
software validation teams need to work in a system where
the memory is intact after the the first hang. For this
case, the next change in this series adds a very aggressive
wedged mode, where guc is blocked to attempt a reset and
our xe driver doesn't do any clean-up other than just signal
some fences.

In this aggressive mode it is expected that some memory will
not be clean and even some leaks might happen. Later, during
rebind and even module reload, many kernel warns will pop-up.
But again, the target here is not to have a safe fail out,
but only a specialized debug of the gpu hang. While
writing this, I realize that I need to add a config protection
to this mode to be only used in debug+expert mode. This will
come soon while addressing some comments that this new version
of the series might receive.

Also, this series includes some IGT tests (sent to igt-dev
mailing list:
1. basic-wedge: tests the mode 1 with a fail inject in the
   		gt-reset then check if accesses are blocked
		and do a rebind to restore the access and
		confirm with a simple exec.
2. wedged-at-any-timeout: tests the aggressive mode 2 described
   			  above, configuring with the debugfs, and
			  then running a simple-hang case. Then
			  also checking for the blocked access,
			  and then rebind to restore the
			  device and confirm with a simple good
			  execution.

$ sudo ./build/tests/xe_wedged
IGT-Version: 1.28-ged62883ec967 (x86_64) (Linux: 6.9.0-rc1+ x86_64)
Using IGT_SRANDOM=1712103638 for randomisation
Opened device: /dev/dri/card0
Starting subtest: basic-wedged
Subtest basic-wedged: SUCCESS (2.855s)
Starting subtest: wedged-at-any-timeout
Subtest wedged-at-any-timeout: SUCCESS (4.580s)

Thanks,
Rodrigo.

Rodrigo Vivi (4):
  drm/xe: Introduce a simple wedged state
  drm/xe: declare wedged upon GuC load failure
  drm/xe: Force wedged state and block GT reset upon any GPU hang
  drm/xe: Introduce the wedged_mode debugfs

 drivers/gpu/drm/xe/xe_debugfs.c             | 56 ++++++++++++++++++++
 drivers/gpu/drm/xe/xe_device.c              | 55 ++++++++++++++++++++
 drivers/gpu/drm/xe/xe_device.h              |  7 +++
 drivers/gpu/drm/xe/xe_device_types.h        | 10 ++++
 drivers/gpu/drm/xe/xe_gt.c                  |  5 +-
 drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c |  3 ++
 drivers/gpu/drm/xe/xe_guc.c                 | 41 +++++++--------
 drivers/gpu/drm/xe/xe_guc_ads.c             | 57 ++++++++++++++++++++-
 drivers/gpu/drm/xe/xe_guc_ads.h             |  1 +
 drivers/gpu/drm/xe/xe_guc_ct.c              |  8 +++
 drivers/gpu/drm/xe/xe_guc_pc.c              |  3 ++
 drivers/gpu/drm/xe/xe_guc_submit.c          | 47 ++++++++++++++---
 drivers/gpu/drm/xe/xe_module.c              |  5 ++
 drivers/gpu/drm/xe/xe_module.h              |  1 +
 14 files changed, 267 insertions(+), 32 deletions(-)

-- 
2.44.0


^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2024-04-24 17:02 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-04-09 22:15 [PATCH 1/4] drm/xe: Introduce a simple wedged state Rodrigo Vivi
2024-04-09 22:15 ` [PATCH 2/4] drm/xe: declare wedged upon GuC load failure Rodrigo Vivi
2024-04-09 23:41   ` Matthew Brost
2024-04-10  6:50   ` Ghimiray, Himal Prasad
2024-04-16 17:40     ` Lucas De Marchi
2024-04-16 17:06   ` Lucas De Marchi
2024-04-16 19:05   ` Rodrigo Vivi
2024-04-16 19:13     ` Lucas De Marchi
2024-04-16 19:19       ` Ghimiray, Himal Prasad
2024-04-09 22:15 ` [PATCH 3/4] drm/xe: Force wedged state and block GT reset upon any GPU hang Rodrigo Vivi
2024-04-10 17:58   ` Matthew Brost
2024-04-16 17:19   ` Lucas De Marchi
2024-04-16 19:08     ` Rodrigo Vivi
2024-04-09 22:15 ` [PATCH 4/4] drm/xe: Introduce the wedged_mode debugfs Rodrigo Vivi
2024-04-17 19:51   ` Lucas De Marchi
2024-04-17 20:29     ` Rodrigo Vivi
2024-04-17 22:50       ` Lucas De Marchi
2024-04-18  5:14   ` Ghimiray, Himal Prasad
2024-04-18 10:44     ` Ghimiray, Himal Prasad
2024-04-09 22:21 ` ✓ CI.Patch_applied: success for series starting with [1/4] drm/xe: Introduce a simple wedged state Patchwork
2024-04-09 22:22 ` ✓ CI.checkpatch: " Patchwork
2024-04-09 22:23 ` ✓ CI.KUnit: " Patchwork
2024-04-09 22:34 ` ✓ CI.Build: " Patchwork
2024-04-09 22:37 ` ✓ CI.Hooks: " Patchwork
2024-04-09 22:38 ` ✓ CI.checksparse: " Patchwork
2024-04-09 23:02 ` ✓ CI.BAT: " Patchwork
2024-04-10  0:14 ` ✗ CI.FULL: failure " Patchwork
2024-04-16 17:03 ` [PATCH 1/4] " Lucas De Marchi
2024-04-16 19:20 ` Ghimiray, Himal Prasad
  -- strict thread matches above, loose matches on Subject: below --
2024-04-23 22:18 Rodrigo Vivi
2024-04-23 22:18 ` [PATCH 3/4] drm/xe: Force wedged state and block GT reset upon any GPU hang Rodrigo Vivi
2024-04-24  3:20   ` Ghimiray, Himal Prasad
2024-04-24 12:25     ` Rodrigo Vivi
2024-04-24 17:01       ` Ghimiray, Himal Prasad
2024-04-03 15:07 [PATCH 0/4] Introduce a wedged state Rodrigo Vivi
2024-04-03 15:07 ` [PATCH 3/4] drm/xe: Force wedged state and block GT reset upon any GPU hang Rodrigo Vivi
2024-04-04 17:52   ` Matthew Brost
2024-04-04 18:01     ` Rodrigo Vivi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox