[PATCH v3 1/2] drm: introduce KMS recovery mechanism

public inbox for dri-devel@lists.freedesktop.org
 help / color / mirror / Atom feed

* [PATCH v3 1/2] drm: introduce KMS recovery mechanism
@ 2026-02-12 23:08 Hamza Mahfooz
  2026-02-12 23:09 ` [PATCH v3 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Hamza Mahfooz @ 2026-02-12 23:08 UTC (permalink / raw)
  To: dri-devel
  Cc: Michel Dänzer, Hamza Mahfooz, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Mario Limonciello, Alex Hung, Wayne Lin,
	Aurabindo Pillai, Ivan Lipski, Timur Kristóf,
	Dominik Kaszewski, amd-gfx, linux-kernel

There should be a mechanism for drivers to respond to flip_done
timeouts. Since, as it stands it is possible for the display to stall
indefinitely, necessitating a hard reset. So, introduce a new mechanism
that tries various methods of recovery with increasing aggression, in
the following order:

1. Force a full modeset (have the compositor reprogram the state from
   scratch).
2. As a last resort, have the driver attempt a vendor specific reset
   (which they can do by reading the return value of
   drm_atomic_helper_wait_for_flip_done()).

Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v2: new to the series
v3: get rid of page_flip_timeout() and have
    drm_atomic_helper_wait_for_flip_done() return a error.
---
 drivers/gpu/drm/drm_atomic_helper.c | 45 +++++++++++++++++++++++++----
 include/drm/drm_atomic_helper.h     |  4 +--
 include/drm/drm_device.h            | 24 +++++++++++++++
 3 files changed, 66 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index 5840e9cc6f66..6ae1234b9e20 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -42,6 +42,7 @@
 #include <drm/drm_gem_atomic_helper.h>
 #include <drm/drm_panic.h>
 #include <drm/drm_print.h>
+#include <drm/drm_probe_helper.h>
 #include <drm/drm_self_refresh_helper.h>
 #include <drm/drm_vblank.h>
 #include <drm/drm_writeback.h>
@@ -1864,11 +1865,15 @@ EXPORT_SYMBOL(drm_atomic_helper_wait_for_vblanks);
  *
  * This requires that drivers use the nonblocking commit tracking support
  * initialized using drm_atomic_helper_setup_commit().
+ *
+ * Returns:
+ * -ETIMEDOUT to indicate that drivers can attempt a vendor reset, 0 otherwise.
  */
-void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
-					  struct drm_atomic_state *state)
+int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
+					 struct drm_atomic_state *state)
 {
 	struct drm_crtc *crtc;
+	int ret = 0;
 	int i;
 
 	for (i = 0; i < dev->mode_config.num_crtc; i++) {
@@ -1881,13 +1886,43 @@ void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
 			continue;
 
 		ret = wait_for_completion_timeout(&commit->flip_done, 10 * HZ);
-		if (ret == 0)
-			drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
-				crtc->base.id, crtc->name);
+		if (!ret) {
+			switch (dev->reset_phase) {
+			case DRM_KMS_RESET_NONE:
+				drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
+					crtc->base.id, crtc->name);
+				dev->reset_phase = DRM_KMS_RESET_FORCE_MODESET;
+				drm_kms_helper_hotplug_event(dev);
+				break;
+			case DRM_KMS_RESET_FORCE_MODESET:
+				drm_err(dev, "[CRTC:%d:%s] force full modeset failed\n",
+					crtc->base.id, crtc->name);
+				dev->reset_phase = DRM_KMS_RESET_VENDOR;
+				ret = -ETIMEDOUT;
+				break;
+			case DRM_KMS_RESET_VENDOR:
+				drm_err(dev, "[CRTC:%d:%s] KMS recovery failed!\n",
+					crtc->base.id, crtc->name);
+				dev->reset_phase = DRM_KMS_RESET_GIVE_UP;
+				break;
+			default:
+				break;
+			}
+
+			goto exit;
+		}
+	}
+
+	if (dev->reset_phase) {
+		drm_info(dev, "KMS recovery succeeded!\n");
+		dev->reset_phase = DRM_KMS_RESET_NONE;
 	}
 
+exit:
 	if (state->fake_commit)
 		complete_all(&state->fake_commit->flip_done);
+
+	return ret;
 }
 EXPORT_SYMBOL(drm_atomic_helper_wait_for_flip_done);
 
diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h
index 53382fe93537..298c8dff3993 100644
--- a/include/drm/drm_atomic_helper.h
+++ b/include/drm/drm_atomic_helper.h
@@ -79,8 +79,8 @@ int drm_atomic_helper_wait_for_fences(struct drm_device *dev,
 void drm_atomic_helper_wait_for_vblanks(struct drm_device *dev,
 					struct drm_atomic_state *old_state);
 
-void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
-					  struct drm_atomic_state *old_state);
+int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
+					 struct drm_atomic_state *old_state);
 
 void
 drm_atomic_helper_update_legacy_modeset_state(struct drm_device *dev,
diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
index bc78fb77cc27..1244d7527e7b 100644
--- a/include/drm/drm_device.h
+++ b/include/drm/drm_device.h
@@ -66,6 +66,23 @@ enum switch_power_state {
 	DRM_SWITCH_POWER_DYNAMIC_OFF = 3,
 };
 
+/**
+ * enum drm_kms_reset_phase - reset phase of drm device
+ */
+enum drm_kms_reset_phase {
+	/** @DRM_KMS_RESET_NONE: Not currently attempting recovery */
+	DRM_KMS_RESET_NONE,
+
+	/** @DRM_KMS_RESET_FORCE_MODESET: Force a full modeset */
+	DRM_KMS_RESET_FORCE_MODESET,
+
+	/** @DRM_KMS_RESET_VENDOR: Attempt a vendor reset */
+	DRM_KMS_RESET_VENDOR,
+
+	/** @DRM_KMS_RESET_GIVE_UP: All recovery methods failed */
+	DRM_KMS_RESET_GIVE_UP,
+};
+
 /**
  * struct drm_device - DRM device structure
  *
@@ -375,6 +392,13 @@ struct drm_device {
 	 * Root directory for debugfs files.
 	 */
 	struct dentry *debugfs_root;
+
+	/**
+	 * @reset_phase:
+	 *
+	 * Reset phase that the device is in.
+	 */
+	enum drm_kms_reset_phase reset_phase;
 };
 
 void drm_dev_set_dma_dev(struct drm_device *dev, struct device *dma_dev);
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v3 2/2] drm/amd/display: add vendor specific reset
  2026-02-12 23:08 [PATCH v3 1/2] drm: introduce KMS recovery mechanism Hamza Mahfooz
@ 2026-02-12 23:09 ` Hamza Mahfooz
  2026-02-18  9:34   ` Christian König
  2026-02-13  0:18 ` [PATCH v3 1/2] drm: introduce KMS recovery mechanism Mario Limonciello
  2026-02-18  9:31 ` Christian König
  2 siblings, 1 reply; 16+ messages in thread
From: Hamza Mahfooz @ 2026-02-12 23:09 UTC (permalink / raw)
  To: dri-devel
  Cc: Michel Dänzer, Hamza Mahfooz, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Mario Limonciello, Alex Hung, Aurabindo Pillai,
	Wayne Lin, Ivan Lipski, Timur Kristóf, Dominik Kaszewski,
	amd-gfx, linux-kernel

We now have a means to respond to page flip timeouts. So, hook up
support by sending out a wedged event if
drm_atomic_helper_wait_for_flip_done() fails.

Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v2: send a wedged event instead of attempting a GPU reset.
v3: read return value of drm_atomic_helper_wait_for_flip_done().
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 7c51d8d7e73c..06d8353cb616 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -87,6 +87,7 @@
 #include <drm/drm_atomic_uapi.h>
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_blend.h>
+#include <drm/drm_drv.h>
 #include <drm/drm_fixed.h>
 #include <drm/drm_fourcc.h>
 #include <drm/drm_edid.h>
@@ -11085,8 +11086,12 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
 	/* Signal HW programming completion */
 	drm_atomic_helper_commit_hw_done(state);
 
-	if (wait_for_vblank)
-		drm_atomic_helper_wait_for_flip_done(dev, state);
+	if (wait_for_vblank) {
+		if (drm_atomic_helper_wait_for_flip_done(dev, state))
+			drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
+					     DRM_WEDGE_RECOVERY_BUS_RESET,
+					     NULL);
+	}
 
 	drm_atomic_helper_cleanup_planes(dev, state);
 
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-12 23:08 [PATCH v3 1/2] drm: introduce KMS recovery mechanism Hamza Mahfooz
  2026-02-12 23:09 ` [PATCH v3 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
@ 2026-02-13  0:18 ` Mario Limonciello
  2026-02-13 19:35   ` Hamza Mahfooz
  2026-02-18  9:31 ` Christian König
  2 siblings, 1 reply; 16+ messages in thread
From: Mario Limonciello @ 2026-02-13  0:18 UTC (permalink / raw)
  To: Hamza Mahfooz, dri-devel
  Cc: Michel Dänzer, Harry Wentland, Leo Li, Rodrigo Siqueira,
	Alex Deucher, Christian König, David Airlie, Simona Vetter,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, Alex Hung,
	Wayne Lin, Aurabindo Pillai, Ivan Lipski, Timur Kristóf,
	Dominik Kaszewski, amd-gfx, linux-kernel



On 2/12/2026 5:08 PM, Hamza Mahfooz wrote:
> There should be a mechanism for drivers to respond to flip_done
> timeouts. Since, as it stands it is possible for the display to stall
> indefinitely, necessitating a hard reset. So, introduce a new mechanism
> that tries various methods of recovery with increasing aggression, in
> the following order:
> 
> 1. Force a full modeset (have the compositor reprogram the state from
>     scratch).
> 2. As a last resort, have the driver attempt a vendor specific reset
>     (which they can do by reading the return value of
>     drm_atomic_helper_wait_for_flip_done()).

Since you were able to (relatively) reliably reproduce a problem in 
amdgpu, how far in your iterative flow did you get?  Did you manage to 
need the vendor specific handling?  And presumably that helped?

> 
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v2: new to the series
> v3: get rid of page_flip_timeout() and have
>      drm_atomic_helper_wait_for_flip_done() return a error.
> ---
>   drivers/gpu/drm/drm_atomic_helper.c | 45 +++++++++++++++++++++++++----
>   include/drm/drm_atomic_helper.h     |  4 +--
>   include/drm/drm_device.h            | 24 +++++++++++++++
>   3 files changed, 66 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
> index 5840e9cc6f66..6ae1234b9e20 100644
> --- a/drivers/gpu/drm/drm_atomic_helper.c
> +++ b/drivers/gpu/drm/drm_atomic_helper.c
> @@ -42,6 +42,7 @@
>   #include <drm/drm_gem_atomic_helper.h>
>   #include <drm/drm_panic.h>
>   #include <drm/drm_print.h>
> +#include <drm/drm_probe_helper.h>
>   #include <drm/drm_self_refresh_helper.h>
>   #include <drm/drm_vblank.h>
>   #include <drm/drm_writeback.h>
> @@ -1864,11 +1865,15 @@ EXPORT_SYMBOL(drm_atomic_helper_wait_for_vblanks);
>    *
>    * This requires that drivers use the nonblocking commit tracking support
>    * initialized using drm_atomic_helper_setup_commit().
> + *
> + * Returns:
> + * -ETIMEDOUT to indicate that drivers can attempt a vendor reset, 0 otherwise.
>    */
> -void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> -					  struct drm_atomic_state *state)
> +int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> +					 struct drm_atomic_state *state)
>   {
>   	struct drm_crtc *crtc;
> +	int ret = 0;
>   	int i;
>   
>   	for (i = 0; i < dev->mode_config.num_crtc; i++) {
> @@ -1881,13 +1886,43 @@ void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
>   			continue;
>   
>   		ret = wait_for_completion_timeout(&commit->flip_done, 10 * HZ);
> -		if (ret == 0)
> -			drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
> -				crtc->base.id, crtc->name);
> +		if (!ret) {
> +			switch (dev->reset_phase) {
> +			case DRM_KMS_RESET_NONE:
> +				drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
> +					crtc->base.id, crtc->name);
> +				dev->reset_phase = DRM_KMS_RESET_FORCE_MODESET;
> +				drm_kms_helper_hotplug_event(dev);
> +				break;

Since you're iterating multiple CRTCs if you manage to recover from one
with this call shouldn't you keep iterating the rest?

> +			case DRM_KMS_RESET_FORCE_MODESET:
> +				drm_err(dev, "[CRTC:%d:%s] force full modeset failed\n",
> +					crtc->base.id, crtc->name);
> +				dev->reset_phase = DRM_KMS_RESET_VENDOR;
> +				ret = -ETIMEDOUT;
> +				break;
> +			case DRM_KMS_RESET_VENDOR:
> +				drm_err(dev, "[CRTC:%d:%s] KMS recovery failed!\n",
> +					crtc->base.id, crtc->name);
> +				dev->reset_phase = DRM_KMS_RESET_GIVE_UP;
> +				break;
> +			default:
> +				break;
> +			}
> +
> +			goto exit;
> +		}
> +	}
> +
> +	if (dev->reset_phase) {
> +		drm_info(dev, "KMS recovery succeeded!\n");
> +		dev->reset_phase = DRM_KMS_RESET_NONE;
>   	}
>   
> +exit:
>   	if (state->fake_commit)
>   		complete_all(&state->fake_commit->flip_done);
> +
> +	return ret;
>   }
>   EXPORT_SYMBOL(drm_atomic_helper_wait_for_flip_done);
>   
> diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h
> index 53382fe93537..298c8dff3993 100644
> --- a/include/drm/drm_atomic_helper.h
> +++ b/include/drm/drm_atomic_helper.h
> @@ -79,8 +79,8 @@ int drm_atomic_helper_wait_for_fences(struct drm_device *dev,
>   void drm_atomic_helper_wait_for_vblanks(struct drm_device *dev,
>   					struct drm_atomic_state *old_state);
>   
> -void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> -					  struct drm_atomic_state *old_state);
> +int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> +					 struct drm_atomic_state *old_state);
>   
>   void
>   drm_atomic_helper_update_legacy_modeset_state(struct drm_device *dev,
> diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
> index bc78fb77cc27..1244d7527e7b 100644
> --- a/include/drm/drm_device.h
> +++ b/include/drm/drm_device.h
> @@ -66,6 +66,23 @@ enum switch_power_state {
>   	DRM_SWITCH_POWER_DYNAMIC_OFF = 3,
>   };
>   
> +/**
> + * enum drm_kms_reset_phase - reset phase of drm device
> + */
> +enum drm_kms_reset_phase {
> +	/** @DRM_KMS_RESET_NONE: Not currently attempting recovery */
> +	DRM_KMS_RESET_NONE,
> +
> +	/** @DRM_KMS_RESET_FORCE_MODESET: Force a full modeset */
> +	DRM_KMS_RESET_FORCE_MODESET,
> +
> +	/** @DRM_KMS_RESET_VENDOR: Attempt a vendor reset */
> +	DRM_KMS_RESET_VENDOR,
> +
> +	/** @DRM_KMS_RESET_GIVE_UP: All recovery methods failed */
> +	DRM_KMS_RESET_GIVE_UP,
> +};
> +
>   /**
>    * struct drm_device - DRM device structure
>    *
> @@ -375,6 +392,13 @@ struct drm_device {
>   	 * Root directory for debugfs files.
>   	 */
>   	struct dentry *debugfs_root;
> +
> +	/**
> +	 * @reset_phase:
> +	 *
> +	 * Reset phase that the device is in.
> +	 */
> +	enum drm_kms_reset_phase reset_phase;
>   };
>   
>   void drm_dev_set_dma_dev(struct drm_device *dev, struct device *dma_dev);


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-13  0:18 ` [PATCH v3 1/2] drm: introduce KMS recovery mechanism Mario Limonciello
@ 2026-02-13 19:35   ` Hamza Mahfooz
  2026-02-14 14:02     ` Michel Dänzer
  0 siblings, 1 reply; 16+ messages in thread
From: Hamza Mahfooz @ 2026-02-13 19:35 UTC (permalink / raw)
  To: Mario Limonciello
  Cc: dri-devel, Michel Dänzer, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Wayne Lin, Aurabindo Pillai,
	Ivan Lipski, Timur Kristóf, Dominik Kaszewski, amd-gfx,
	linux-kernel

On Thu, Feb 12, 2026 at 06:18:17PM -0600, Mario Limonciello wrote:
> Since you were able to (relatively) reliably reproduce a problem in amdgpu,
> how far in your iterative flow did you get?  Did you manage to need the
> vendor specific handling?  And presumably that helped?
> 

Every time I've tested it (with my repro) the full modeset has failed
and it was able to recover with the vendor specific handling. Though
it's worth noting that I strongly suspect a firmware hang in my case[1].

[1] https://lore.kernel.org/r/aYplYyf6Pp20lOAD@hal-station/

> > @@ -1881,13 +1886,43 @@ void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> >   			continue;
> >   		ret = wait_for_completion_timeout(&commit->flip_done, 10 * HZ);
> > -		if (ret == 0)
> > -			drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
> > -				crtc->base.id, crtc->name);
> > +		if (!ret) {
> > +			switch (dev->reset_phase) {
> > +			case DRM_KMS_RESET_NONE:
> > +				drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
> > +					crtc->base.id, crtc->name);
> > +				dev->reset_phase = DRM_KMS_RESET_FORCE_MODESET;
> > +				drm_kms_helper_hotplug_event(dev);
> > +				break;
> 
> Since you're iterating multiple CRTCs if you manage to recover from one
> with this call shouldn't you keep iterating the rest?
> 

Most measures that the can be implemented at the kernel level (including
forcing a full modeset), can't save the the current commit. So, in all
likelihood we will just end up waiting an extra 10 seconds per CRTC
(assuming they haven't completed already, unrelated to the forced
modeset).

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-13 19:35   ` Hamza Mahfooz
@ 2026-02-14 14:02     ` Michel Dänzer
  2026-02-14 22:16       ` Hamza Mahfooz
  0 siblings, 1 reply; 16+ messages in thread
From: Michel Dänzer @ 2026-02-14 14:02 UTC (permalink / raw)
  To: Hamza Mahfooz, Mario Limonciello
  Cc: dri-devel, Harry Wentland, Leo Li, Rodrigo Siqueira, Alex Deucher,
	Christian König, David Airlie, Simona Vetter,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, Alex Hung,
	Wayne Lin, Aurabindo Pillai, Ivan Lipski, Timur Kristóf,
	Dominik Kaszewski, amd-gfx, linux-kernel

On 2/13/26 20:35, Hamza Mahfooz wrote:
> On Thu, Feb 12, 2026 at 06:18:17PM -0600, Mario Limonciello wrote:
> 
>>> @@ -1881,13 +1886,43 @@ void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
>>>   			continue;
>>>   		ret = wait_for_completion_timeout(&commit->flip_done, 10 * HZ);
>>> -		if (ret == 0)
>>> -			drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
>>> -				crtc->base.id, crtc->name);
>>> +		if (!ret) {
>>> +			switch (dev->reset_phase) {
>>> +			case DRM_KMS_RESET_NONE:
>>> +				drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
>>> +					crtc->base.id, crtc->name);
>>> +				dev->reset_phase = DRM_KMS_RESET_FORCE_MODESET;
>>> +				drm_kms_helper_hotplug_event(dev);
>>> +				break;
>>
>> Since you're iterating multiple CRTCs if you manage to recover from one
>> with this call shouldn't you keep iterating the rest?
> 
> Most measures that the can be implemented at the kernel level (including
> forcing a full modeset), can't save the the current commit.

Why couldn't a full modeset?


> So, in all likelihood we will just end up waiting an extra 10 seconds per CRTC
> (assuming they haven't completed already, unrelated to the forced
> modeset).

In principle it's possible to do (the equivalent of) a modeset with the current state for all CRTCs, no need to do it separately per CRTC.


-- 
Earthling Michel Dänzer       \        GNOME / Xwayland / Mesa developer
https://redhat.com             \               Libre software enthusiast

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-14 14:02     ` Michel Dänzer
@ 2026-02-14 22:16       ` Hamza Mahfooz
  2026-02-16  9:28         ` Michel Dänzer
  0 siblings, 1 reply; 16+ messages in thread
From: Hamza Mahfooz @ 2026-02-14 22:16 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: Mario Limonciello, dri-devel, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Wayne Lin, Aurabindo Pillai,
	Ivan Lipski, Timur Kristóf, Dominik Kaszewski, amd-gfx,
	linux-kernel

On Sat, Feb 14, 2026 at 03:02:49PM +0100, Michel Dänzer wrote:
> Why couldn't a full modeset?
> 

As far as I see it the only reasons why we should be timing out is
either an interrupt went missing (perhaps due to a race condition in
driver code) or hung hardware. In either case, the interrupt associated
with the page flip for the current commit is long gone.

> In principle it's possible to do (the equivalent of) a modeset with the current state for all CRTCs, no need to do it separately per CRTC.

AFAIK that is what the uevent is already doing (unless I'm mistaken).

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-14 22:16       ` Hamza Mahfooz
@ 2026-02-16  9:28         ` Michel Dänzer
  2026-02-18  0:45           ` Hamza Mahfooz
  0 siblings, 1 reply; 16+ messages in thread
From: Michel Dänzer @ 2026-02-16  9:28 UTC (permalink / raw)
  To: Hamza Mahfooz
  Cc: Mario Limonciello, dri-devel, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Wayne Lin, Aurabindo Pillai,
	Ivan Lipski, Timur Kristóf, Dominik Kaszewski, amd-gfx,
	linux-kernel

On 2/14/26 23:16, Hamza Mahfooz wrote:
> On Sat, Feb 14, 2026 at 03:02:49PM +0100, Michel Dänzer wrote:
>> Why couldn't a full modeset?
> 
> As far as I see it the only reasons why we should be timing out is
> either an interrupt went missing (perhaps due to a race condition in
> driver code) or hung hardware. In either case, the interrupt associated
> with the page flip for the current commit is long gone.

That's a matter of bookkeeping, the interrupt isn't required to keep track of the commit and complete it.


>> In principle it's possible to do (the equivalent of) a modeset with the current state for all CRTCs, no need to do it separately per CRTC.
> 
> AFAIK that is what the uevent is already doing (unless I'm mistaken).

This is about just doing a full modeset, which isn't something user space can do in response to a wedged event.


-- 
Earthling Michel Dänzer       \        GNOME / Xwayland / Mesa developer
https://redhat.com             \               Libre software enthusiast

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-16  9:28         ` Michel Dänzer
@ 2026-02-18  0:45           ` Hamza Mahfooz
  2026-02-18  9:22             ` Michel Dänzer
  0 siblings, 1 reply; 16+ messages in thread
From: Hamza Mahfooz @ 2026-02-18  0:45 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: Mario Limonciello, dri-devel, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Wayne Lin, Aurabindo Pillai,
	Ivan Lipski, Timur Kristóf, Dominik Kaszewski, amd-gfx,
	linux-kernel

On Mon, Feb 16, 2026 at 10:28:13AM +0100, Michel Dänzer wrote:
> On 2/14/26 23:16, Hamza Mahfooz wrote:
> > On Sat, Feb 14, 2026 at 03:02:49PM +0100, Michel Dänzer wrote:
> >> Why couldn't a full modeset?
> > 
> > As far as I see it the only reasons why we should be timing out is
> > either an interrupt went missing (perhaps due to a race condition in
> > driver code) or hung hardware. In either case, the interrupt associated
> > with the page flip for the current commit is long gone.
> 
> That's a matter of bookkeeping, the interrupt isn't required to keep track of the commit and complete it.

Oh, if you're talking about sending out a "fake"
drm_crtc_send_vblank_event(), I had considered that. Though,
drivers that have hardware vblank counters take ownership of the
relevant `struct drm_pending_vblank_event` [1] pretty early on.
So, there wouldn't be a way to ensure that they send that out
and that would mean we never get a chance to force a full modeset.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/drm/drm_crtc.h?h=v6.19#n386

> 
> 
> >> In principle it's possible to do (the equivalent of) a modeset with the current state for all CRTCs, no need to do it separately per CRTC.
> > 
> > AFAIK that is what the uevent is already doing (unless I'm mistaken).
> 
> This is about just doing a full modeset, which isn't something user space can do in response to a wedged event.

I was referring to what compositors are doing in response to
`drm_kms_helper_hotplug_event()`. Perhaps, the enum constants should be
renamed, since the forced modeset is actually sent when the current
reset phase is DRM_KMS_RESET_NONE (the phase is updated before sending
out the event though).

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-18  0:45           ` Hamza Mahfooz
@ 2026-02-18  9:22             ` Michel Dänzer
  2026-02-20  8:42               ` Michel Dänzer
  0 siblings, 1 reply; 16+ messages in thread
From: Michel Dänzer @ 2026-02-18  9:22 UTC (permalink / raw)
  To: Hamza Mahfooz
  Cc: Mario Limonciello, dri-devel, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Wayne Lin, Aurabindo Pillai,
	Ivan Lipski, Timur Kristóf, Dominik Kaszewski, amd-gfx,
	linux-kernel

On 2/18/26 01:45, Hamza Mahfooz wrote:
> On Mon, Feb 16, 2026 at 10:28:13AM +0100, Michel Dänzer wrote:
>> On 2/14/26 23:16, Hamza Mahfooz wrote:
>>> On Sat, Feb 14, 2026 at 03:02:49PM +0100, Michel Dänzer wrote:
>>>> Why couldn't a full modeset?
>>>
>>> As far as I see it the only reasons why we should be timing out is
>>> either an interrupt went missing (perhaps due to a race condition in
>>> driver code) or hung hardware. In either case, the interrupt associated
>>> with the page flip for the current commit is long gone.
>>
>> That's a matter of bookkeeping, the interrupt isn't required to keep track of the commit and complete it.
> 
> Oh, if you're talking about sending out a "fake"
> drm_crtc_send_vblank_event(), I had considered that. Though,
> drivers that have hardware vblank counters take ownership of the
> relevant `struct drm_pending_vblank_event` [1] pretty early on.
> So, there wouldn't be a way to ensure that they send that out
> and that would mean we never get a chance to force a full modeset.

"never get a chance" why? The driver can do a full modeset and then send out the event(s) with the current vblank sequence number(s) and timestamp(s) at that time.


>>>> In principle it's possible to do (the equivalent of) a modeset with the current state for all CRTCs, no need to do it separately per CRTC.
>>>
>>> AFAIK that is what the uevent is already doing (unless I'm mistaken).
>>
>> This is about just doing a full modeset, which isn't something user space can do in response to a wedged event.
> 
> I was referring to what compositors are doing in response to
> `drm_kms_helper_hotplug_event()`. Perhaps, the enum constants should be
> renamed, since the forced modeset is actually sent when the current
> reset phase is DRM_KMS_RESET_NONE (the phase is updated before sending
> out the event though).

Ah, you're talking about the drm_kms_helper_hotplug_event call in drm_atomic_helper_wait_for_flip_done (I thought "uevent" referred to drm_dev_wedged_event in patch 2).

I don't know about other compositors, but I don't think mutter will do a modeset in response to a hotplug event if no KMS state changed (because some monitors can generate spurious hotplug events).

(Just one more reason why kicking the full modeset can down to user space is questionable)


-- 
Earthling Michel Dänzer       \        GNOME / Xwayland / Mesa developer
https://redhat.com             \               Libre software enthusiast

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-12 23:08 [PATCH v3 1/2] drm: introduce KMS recovery mechanism Hamza Mahfooz
  2026-02-12 23:09 ` [PATCH v3 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
  2026-02-13  0:18 ` [PATCH v3 1/2] drm: introduce KMS recovery mechanism Mario Limonciello
@ 2026-02-18  9:31 ` Christian König
  2 siblings, 0 replies; 16+ messages in thread
From: Christian König @ 2026-02-18  9:31 UTC (permalink / raw)
  To: Hamza Mahfooz, dri-devel
  Cc: Michel Dänzer, Harry Wentland, Leo Li, Rodrigo Siqueira,
	Alex Deucher, David Airlie, Simona Vetter, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, Mario Limonciello, Alex Hung,
	Wayne Lin, Aurabindo Pillai, Ivan Lipski, Timur Kristóf,
	Dominik Kaszewski, amd-gfx, linux-kernel

On 2/13/26 00:08, Hamza Mahfooz wrote:
> There should be a mechanism for drivers to respond to flip_done
> timeouts. Since, as it stands it is possible for the display to stall
> indefinitely, necessitating a hard reset. So, introduce a new mechanism
> that tries various methods of recovery with increasing aggression, in
> the following order:
> 
> 1. Force a full modeset (have the compositor reprogram the state from
>    scratch).
> 2. As a last resort, have the driver attempt a vendor specific reset
>    (which they can do by reading the return value of
>    drm_atomic_helper_wait_for_flip_done()).
> 
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v2: new to the series
> v3: get rid of page_flip_timeout() and have
>     drm_atomic_helper_wait_for_flip_done() return a error.
> ---
>  drivers/gpu/drm/drm_atomic_helper.c | 45 +++++++++++++++++++++++++----
>  include/drm/drm_atomic_helper.h     |  4 +--
>  include/drm/drm_device.h            | 24 +++++++++++++++
>  3 files changed, 66 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
> index 5840e9cc6f66..6ae1234b9e20 100644
> --- a/drivers/gpu/drm/drm_atomic_helper.c
> +++ b/drivers/gpu/drm/drm_atomic_helper.c
> @@ -42,6 +42,7 @@
>  #include <drm/drm_gem_atomic_helper.h>
>  #include <drm/drm_panic.h>
>  #include <drm/drm_print.h>
> +#include <drm/drm_probe_helper.h>
>  #include <drm/drm_self_refresh_helper.h>
>  #include <drm/drm_vblank.h>
>  #include <drm/drm_writeback.h>
> @@ -1864,11 +1865,15 @@ EXPORT_SYMBOL(drm_atomic_helper_wait_for_vblanks);
>   *
>   * This requires that drivers use the nonblocking commit tracking support
>   * initialized using drm_atomic_helper_setup_commit().
> + *
> + * Returns:
> + * -ETIMEDOUT to indicate that drivers can attempt a vendor reset, 0 otherwise.
>   */
> -void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> -					  struct drm_atomic_state *state)
> +int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> +					 struct drm_atomic_state *state)
>  {
>  	struct drm_crtc *crtc;
> +	int ret = 0;
>  	int i;
>  
>  	for (i = 0; i < dev->mode_config.num_crtc; i++) {
> @@ -1881,13 +1886,43 @@ void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
>  			continue;
>  
>  		ret = wait_for_completion_timeout(&commit->flip_done, 10 * HZ);
> -		if (ret == 0)
> -			drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
> -				crtc->base.id, crtc->name);
> +		if (!ret) {
> +			switch (dev->reset_phase) {
> +			case DRM_KMS_RESET_NONE:

That state machine is driver specific and so doesn't belong into DRM.

So please completely nuke that and just return the error code to the driver.

Regards,
Christian.

> +				drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
> +					crtc->base.id, crtc->name);
> +				dev->reset_phase = DRM_KMS_RESET_FORCE_MODESET;
> +				drm_kms_helper_hotplug_event(dev);
> +				break;
> +			case DRM_KMS_RESET_FORCE_MODESET:
> +				drm_err(dev, "[CRTC:%d:%s] force full modeset failed\n",
> +					crtc->base.id, crtc->name);
> +				dev->reset_phase = DRM_KMS_RESET_VENDOR;
> +				ret = -ETIMEDOUT;
> +				break;
> +			case DRM_KMS_RESET_VENDOR:
> +				drm_err(dev, "[CRTC:%d:%s] KMS recovery failed!\n",
> +					crtc->base.id, crtc->name);
> +				dev->reset_phase = DRM_KMS_RESET_GIVE_UP;
> +				break;
> +			default:
> +				break;
> +			}
> +
> +			goto exit;
> +		}
> +	}
> +
> +	if (dev->reset_phase) {
> +		drm_info(dev, "KMS recovery succeeded!\n");
> +		dev->reset_phase = DRM_KMS_RESET_NONE;
>  	}
>  
> +exit:
>  	if (state->fake_commit)
>  		complete_all(&state->fake_commit->flip_done);
> +
> +	return ret;
>  }
>  EXPORT_SYMBOL(drm_atomic_helper_wait_for_flip_done);
>  
> diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h
> index 53382fe93537..298c8dff3993 100644
> --- a/include/drm/drm_atomic_helper.h
> +++ b/include/drm/drm_atomic_helper.h
> @@ -79,8 +79,8 @@ int drm_atomic_helper_wait_for_fences(struct drm_device *dev,
>  void drm_atomic_helper_wait_for_vblanks(struct drm_device *dev,
>  					struct drm_atomic_state *old_state);
>  
> -void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> -					  struct drm_atomic_state *old_state);
> +int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> +					 struct drm_atomic_state *old_state);
>  
>  void
>  drm_atomic_helper_update_legacy_modeset_state(struct drm_device *dev,
> diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
> index bc78fb77cc27..1244d7527e7b 100644
> --- a/include/drm/drm_device.h
> +++ b/include/drm/drm_device.h
> @@ -66,6 +66,23 @@ enum switch_power_state {
>  	DRM_SWITCH_POWER_DYNAMIC_OFF = 3,
>  };
>  
> +/**
> + * enum drm_kms_reset_phase - reset phase of drm device
> + */
> +enum drm_kms_reset_phase {
> +	/** @DRM_KMS_RESET_NONE: Not currently attempting recovery */
> +	DRM_KMS_RESET_NONE,
> +
> +	/** @DRM_KMS_RESET_FORCE_MODESET: Force a full modeset */
> +	DRM_KMS_RESET_FORCE_MODESET,
> +
> +	/** @DRM_KMS_RESET_VENDOR: Attempt a vendor reset */
> +	DRM_KMS_RESET_VENDOR,
> +
> +	/** @DRM_KMS_RESET_GIVE_UP: All recovery methods failed */
> +	DRM_KMS_RESET_GIVE_UP,
> +};
> +
>  /**
>   * struct drm_device - DRM device structure
>   *
> @@ -375,6 +392,13 @@ struct drm_device {
>  	 * Root directory for debugfs files.
>  	 */
>  	struct dentry *debugfs_root;
> +
> +	/**
> +	 * @reset_phase:
> +	 *
> +	 * Reset phase that the device is in.
> +	 */
> +	enum drm_kms_reset_phase reset_phase;
>  };
>  
>  void drm_dev_set_dma_dev(struct drm_device *dev, struct device *dma_dev);


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 2/2] drm/amd/display: add vendor specific reset
  2026-02-12 23:09 ` [PATCH v3 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
@ 2026-02-18  9:34   ` Christian König
  0 siblings, 0 replies; 16+ messages in thread
From: Christian König @ 2026-02-18  9:34 UTC (permalink / raw)
  To: Hamza Mahfooz, dri-devel
  Cc: Michel Dänzer, Harry Wentland, Leo Li, Rodrigo Siqueira,
	Alex Deucher, David Airlie, Simona Vetter, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, Mario Limonciello, Alex Hung,
	Aurabindo Pillai, Wayne Lin, Ivan Lipski, Timur Kristóf,
	Dominik Kaszewski, amd-gfx, linux-kernel

On 2/13/26 00:09, Hamza Mahfooz wrote:
> We now have a means to respond to page flip timeouts. So, hook up
> support by sending out a wedged event if
> drm_atomic_helper_wait_for_flip_done() fails.
> 
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v2: send a wedged event instead of attempting a GPU reset.
> v3: read return value of drm_atomic_helper_wait_for_flip_done().
> ---
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 7c51d8d7e73c..06d8353cb616 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -87,6 +87,7 @@
>  #include <drm/drm_atomic_uapi.h>
>  #include <drm/drm_atomic_helper.h>
>  #include <drm/drm_blend.h>
> +#include <drm/drm_drv.h>
>  #include <drm/drm_fixed.h>
>  #include <drm/drm_fourcc.h>
>  #include <drm/drm_edid.h>
> @@ -11085,8 +11086,12 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
>  	/* Signal HW programming completion */
>  	drm_atomic_helper_commit_hw_done(state);
>  
> -	if (wait_for_vblank)
> -		drm_atomic_helper_wait_for_flip_done(dev, state);
> +	if (wait_for_vblank) {
> +		if (drm_atomic_helper_wait_for_flip_done(dev, state))
> +			drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
> +					     DRM_WEDGE_RECOVERY_BUS_RESET,
> +					     NULL);

Well that is the worst of all options since it will completely kill userspace.

We need to figure out what exactly is necessary to get the CRTC going again.

Regards,
Christian.

> +	}
>  
>  	drm_atomic_helper_cleanup_planes(dev, state);
>  


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-18  9:22             ` Michel Dänzer
@ 2026-02-20  8:42               ` Michel Dänzer
  2026-02-20  9:09                 ` Michel Dänzer
  0 siblings, 1 reply; 16+ messages in thread
From: Michel Dänzer @ 2026-02-20  8:42 UTC (permalink / raw)
  To: Hamza Mahfooz
  Cc: Mario Limonciello, dri-devel, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Wayne Lin, Aurabindo Pillai,
	Ivan Lipski, Timur Kristóf, Dominik Kaszewski, amd-gfx,
	linux-kernel

On 2/18/26 10:22, Michel Dänzer wrote:
> On 2/18/26 01:45, Hamza Mahfooz wrote:
>> On Mon, Feb 16, 2026 at 10:28:13AM +0100, Michel Dänzer wrote:
>>> On 2/14/26 23:16, Hamza Mahfooz wrote:
>>>> On Sat, Feb 14, 2026 at 03:02:49PM +0100, Michel Dänzer wrote:
>>>>
>>>>> In principle it's possible to do (the equivalent of) a modeset with the current state for all CRTCs, no need to do it separately per CRTC.
>>>>
>>>> AFAIK that is what the uevent is already doing (unless I'm mistaken).
>>>
>>> This is about just doing a full modeset, which isn't something user space can do in response to a wedged event.
>>
>> I was referring to what compositors are doing in response to
>> `drm_kms_helper_hotplug_event()`. Perhaps, the enum constants should be
>> renamed, since the forced modeset is actually sent when the current
>> reset phase is DRM_KMS_RESET_NONE (the phase is updated before sending
>> out the event though).
> 
> Ah, you're talking about the drm_kms_helper_hotplug_event call in drm_atomic_helper_wait_for_flip_done (I thought "uevent" referred to drm_dev_wedged_event in patch 2).
> 
> I don't know about other compositors, but I don't think mutter will do a modeset in response to a hotplug event if no KMS state changed (because some monitors can generate spurious hotplug events).

FWIW, if it really turns out impossible for the kernel to do a modeset itself (which I remain unconvinced of), one way to require a modeset from user space is to set the "link-status" connector property to "Bad".


-- 
Earthling Michel Dänzer       \        GNOME / Xwayland / Mesa developer
https://redhat.com             \               Libre software enthusiast

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-20  8:42               ` Michel Dänzer
@ 2026-02-20  9:09                 ` Michel Dänzer
  2026-02-20 14:08                   ` Michel Dänzer
  2026-04-17 16:08                   ` Michel Dänzer
  0 siblings, 2 replies; 16+ messages in thread
From: Michel Dänzer @ 2026-02-20  9:09 UTC (permalink / raw)
  To: Hamza Mahfooz
  Cc: Mario Limonciello, dri-devel, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Wayne Lin, Aurabindo Pillai,
	Ivan Lipski, Timur Kristóf, Dominik Kaszewski, amd-gfx,
	linux-kernel

On 2/20/26 09:42, Michel Dänzer wrote:
> On 2/18/26 10:22, Michel Dänzer wrote:
>> On 2/18/26 01:45, Hamza Mahfooz wrote:
>>> On Mon, Feb 16, 2026 at 10:28:13AM +0100, Michel Dänzer wrote:
>>>> On 2/14/26 23:16, Hamza Mahfooz wrote:
>>>>> On Sat, Feb 14, 2026 at 03:02:49PM +0100, Michel Dänzer wrote:
>>>>>
>>>>>> In principle it's possible to do (the equivalent of) a modeset with the current state for all CRTCs, no need to do it separately per CRTC.
>>>>>
>>>>> AFAIK that is what the uevent is already doing (unless I'm mistaken).
>>>>
>>>> This is about just doing a full modeset, which isn't something user space can do in response to a wedged event.
>>>
>>> I was referring to what compositors are doing in response to
>>> `drm_kms_helper_hotplug_event()`. Perhaps, the enum constants should be
>>> renamed, since the forced modeset is actually sent when the current
>>> reset phase is DRM_KMS_RESET_NONE (the phase is updated before sending
>>> out the event though).
>>
>> Ah, you're talking about the drm_kms_helper_hotplug_event call in drm_atomic_helper_wait_for_flip_done (I thought "uevent" referred to drm_dev_wedged_event in patch 2).
>>
>> I don't know about other compositors, but I don't think mutter will do a modeset in response to a hotplug event if no KMS state changed (because some monitors can generate spurious hotplug events).
> 
> FWIW, if it really turns out impossible for the kernel to do a modeset itself (which I remain unconvinced of), one way to require a modeset from user space is to set the "link-status" connector property to "Bad".

Having written that down, a reason why the kernel can't just do a modeset itself occurred to me: A modeset might affect other CRTCs than the ones affected by the timed-out commit, which could interact badly with other pending commits affecting those other CRTCs.

The "link-status" property seems like the best solution so far.

(Note that not all compositors are paying attention to it yet, they'll need to be fixed)


-- 
Earthling Michel Dänzer       \        GNOME / Xwayland / Mesa developer
https://redhat.com             \               Libre software enthusiast

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-20  9:09                 ` Michel Dänzer
@ 2026-02-20 14:08                   ` Michel Dänzer
  2026-04-17 16:08                   ` Michel Dänzer
  1 sibling, 0 replies; 16+ messages in thread
From: Michel Dänzer @ 2026-02-20 14:08 UTC (permalink / raw)
  To: Hamza Mahfooz
  Cc: Mario Limonciello, dri-devel, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Wayne Lin, Aurabindo Pillai,
	Ivan Lipski, Timur Kristóf, Dominik Kaszewski, amd-gfx,
	linux-kernel

On 2/20/26 10:09, Michel Dänzer wrote:
> On 2/20/26 09:42, Michel Dänzer wrote:
>> On 2/18/26 10:22, Michel Dänzer wrote:
>>> On 2/18/26 01:45, Hamza Mahfooz wrote:
>>>> On Mon, Feb 16, 2026 at 10:28:13AM +0100, Michel Dänzer wrote:
>>>>> On 2/14/26 23:16, Hamza Mahfooz wrote:
>>>>>> On Sat, Feb 14, 2026 at 03:02:49PM +0100, Michel Dänzer wrote:
>>>>>>
>>>>>>> In principle it's possible to do (the equivalent of) a modeset with the current state for all CRTCs, no need to do it separately per CRTC.
>>>>>>
>>>>>> AFAIK that is what the uevent is already doing (unless I'm mistaken).
>>>>>
>>>>> This is about just doing a full modeset, which isn't something user space can do in response to a wedged event.
>>>>
>>>> I was referring to what compositors are doing in response to
>>>> `drm_kms_helper_hotplug_event()`. Perhaps, the enum constants should be
>>>> renamed, since the forced modeset is actually sent when the current
>>>> reset phase is DRM_KMS_RESET_NONE (the phase is updated before sending
>>>> out the event though).
>>>
>>> Ah, you're talking about the drm_kms_helper_hotplug_event call in drm_atomic_helper_wait_for_flip_done (I thought "uevent" referred to drm_dev_wedged_event in patch 2).
>>>
>>> I don't know about other compositors, but I don't think mutter will do a modeset in response to a hotplug event if no KMS state changed (because some monitors can generate spurious hotplug events).
>>
>> FWIW, if it really turns out impossible for the kernel to do a modeset itself (which I remain unconvinced of), one way to require a modeset from user space is to set the "link-status" connector property to "Bad".
> 
> Having written that down, a reason why the kernel can't just do a modeset itself occurred to me: A modeset might affect other CRTCs than the ones affected by the timed-out commit, which could interact badly with other pending commits affecting those other CRTCs.
> 
> The "link-status" property seems like the best solution so far.

Discussing the "link-status" property, it turns out I slightly misremembered its purpose, it's not really suitable for this.

Maybe the hotplug event is the only way after all.


-- 
Earthling Michel Dänzer       \        GNOME / Xwayland / Mesa developer
https://redhat.com             \               Libre software enthusiast

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-02-20  9:09                 ` Michel Dänzer
  2026-02-20 14:08                   ` Michel Dänzer
@ 2026-04-17 16:08                   ` Michel Dänzer
  2026-04-17 16:28                     ` Ville Syrjälä
  1 sibling, 1 reply; 16+ messages in thread
From: Michel Dänzer @ 2026-04-17 16:08 UTC (permalink / raw)
  To: Hamza Mahfooz
  Cc: Mario Limonciello, dri-devel, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Wayne Lin, Aurabindo Pillai,
	Ivan Lipski, Timur Kristóf, Dominik Kaszewski, amd-gfx,
	linux-kernel, Ville Syrjälä

On 2/20/26 10:09, Michel Dänzer wrote:
> On 2/20/26 09:42, Michel Dänzer wrote:
>> On 2/18/26 10:22, Michel Dänzer wrote:
>>> On 2/18/26 01:45, Hamza Mahfooz wrote:
>>>> On Mon, Feb 16, 2026 at 10:28:13AM +0100, Michel Dänzer wrote:
>>>>> On 2/14/26 23:16, Hamza Mahfooz wrote:
>>>>>> On Sat, Feb 14, 2026 at 03:02:49PM +0100, Michel Dänzer wrote:
>>>>>>
>>>>>>> In principle it's possible to do (the equivalent of) a modeset with the current state for all CRTCs, no need to do it separately per CRTC.
>>>>>>
>>>>>> AFAIK that is what the uevent is already doing (unless I'm mistaken).
>>>>>
>>>>> This is about just doing a full modeset, which isn't something user space can do in response to a wedged event.
>>>>
>>>> I was referring to what compositors are doing in response to
>>>> `drm_kms_helper_hotplug_event()`. Perhaps, the enum constants should be
>>>> renamed, since the forced modeset is actually sent when the current
>>>> reset phase is DRM_KMS_RESET_NONE (the phase is updated before sending
>>>> out the event though).
>>>
>>> Ah, you're talking about the drm_kms_helper_hotplug_event call in drm_atomic_helper_wait_for_flip_done (I thought "uevent" referred to drm_dev_wedged_event in patch 2).
>>>
>>> I don't know about other compositors, but I don't think mutter will do a modeset in response to a hotplug event if no KMS state changed (because some monitors can generate spurious hotplug events).
>>
>> [...]
> 
> [...], a reason why the kernel can't just do a modeset itself occurred to me: A modeset might affect other CRTCs than the ones affected by the timed-out commit, which could interact badly with other pending commits affecting those other CRTCs.

In a different thread, Ville (added to Cc) said this shouldn't be an issue.

In which case this would seem like the simplest solution.


-- 
Earthling Michel Dänzer       \        GNOME / Xwayland / Mesa developer
https://redhat.com             \               Libre software enthusiast

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] drm: introduce KMS recovery mechanism
  2026-04-17 16:08                   ` Michel Dänzer
@ 2026-04-17 16:28                     ` Ville Syrjälä
  0 siblings, 0 replies; 16+ messages in thread
From: Ville Syrjälä @ 2026-04-17 16:28 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: Hamza Mahfooz, Mario Limonciello, dri-devel, Harry Wentland,
	Leo Li, Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Wayne Lin, Aurabindo Pillai,
	Ivan Lipski, Timur Kristóf, Dominik Kaszewski, amd-gfx,
	linux-kernel

On Fri, Apr 17, 2026 at 06:08:05PM +0200, Michel Dänzer wrote:
> On 2/20/26 10:09, Michel Dänzer wrote:
> > On 2/20/26 09:42, Michel Dänzer wrote:
> >> On 2/18/26 10:22, Michel Dänzer wrote:
> >>> On 2/18/26 01:45, Hamza Mahfooz wrote:
> >>>> On Mon, Feb 16, 2026 at 10:28:13AM +0100, Michel Dänzer wrote:
> >>>>> On 2/14/26 23:16, Hamza Mahfooz wrote:
> >>>>>> On Sat, Feb 14, 2026 at 03:02:49PM +0100, Michel Dänzer wrote:
> >>>>>>
> >>>>>>> In principle it's possible to do (the equivalent of) a modeset with the current state for all CRTCs, no need to do it separately per CRTC.
> >>>>>>
> >>>>>> AFAIK that is what the uevent is already doing (unless I'm mistaken).
> >>>>>
> >>>>> This is about just doing a full modeset, which isn't something user space can do in response to a wedged event.
> >>>>
> >>>> I was referring to what compositors are doing in response to
> >>>> `drm_kms_helper_hotplug_event()`. Perhaps, the enum constants should be
> >>>> renamed, since the forced modeset is actually sent when the current
> >>>> reset phase is DRM_KMS_RESET_NONE (the phase is updated before sending
> >>>> out the event though).
> >>>
> >>> Ah, you're talking about the drm_kms_helper_hotplug_event call in drm_atomic_helper_wait_for_flip_done (I thought "uevent" referred to drm_dev_wedged_event in patch 2).
> >>>
> >>> I don't know about other compositors, but I don't think mutter will do a modeset in response to a hotplug event if no KMS state changed (because some monitors can generate spurious hotplug events).
> >>
> >> [...]
> > 
> > [...], a reason why the kernel can't just do a modeset itself occurred to me: A modeset might affect other CRTCs than the ones affected by the timed-out commit, which could interact badly with other pending commits affecting those other CRTCs.
> 
> In a different thread, Ville (added to Cc) said this shouldn't be an issue.
> 
> In which case this would seem like the simplest solution.

Yeah, so extra blocking commits will just make everything (including
subsequent non-blocking commits) block on the modeset mutexes before
the -EBUSY check even happens.

On a somewhat relatd note, some time ago I proposed making blocking
commits also do the commit_tail part locklessly. But realizing that
something might depend on the current not-actually-nonblocking commit
semantics I was careful to preserve that even while dropping the locks:
https://lore.kernel.org/dri-devel/20220916163331.6849-1-ville.syrjala@linux.intel.com/

-- 
Ville Syrjälä
Intel

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2026-04-17 16:28 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-02-12 23:08 [PATCH v3 1/2] drm: introduce KMS recovery mechanism Hamza Mahfooz
2026-02-12 23:09 ` [PATCH v3 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
2026-02-18  9:34   ` Christian König
2026-02-13  0:18 ` [PATCH v3 1/2] drm: introduce KMS recovery mechanism Mario Limonciello
2026-02-13 19:35   ` Hamza Mahfooz
2026-02-14 14:02     ` Michel Dänzer
2026-02-14 22:16       ` Hamza Mahfooz
2026-02-16  9:28         ` Michel Dänzer
2026-02-18  0:45           ` Hamza Mahfooz
2026-02-18  9:22             ` Michel Dänzer
2026-02-20  8:42               ` Michel Dänzer
2026-02-20  9:09                 ` Michel Dänzer
2026-02-20 14:08                   ` Michel Dänzer
2026-04-17 16:08                   ` Michel Dänzer
2026-04-17 16:28                     ` Ville Syrjälä
2026-02-18  9:31 ` Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox