[PATCH v4 1/2] drm: introduce KMS recovery mechanism

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH v4 1/2] drm: introduce KMS recovery mechanism
@ 2026-02-20 17:15 Hamza Mahfooz
  2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
  2026-02-28 18:48 ` [PATCH v4 1/2] drm: introduce KMS recovery mechanism Shengyu Qu
  0 siblings, 2 replies; 7+ messages in thread
From: Hamza Mahfooz @ 2026-02-20 17:15 UTC (permalink / raw)
  To: dri-devel
  Cc: Michel Dänzer, Mario Limonciello, Hamza Mahfooz,
	Harry Wentland, Leo Li, Rodrigo Siqueira, Alex Deucher,
	Christian König, David Airlie, Simona Vetter,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, Alex Hung,
	Aurabindo Pillai, Wayne Lin, Timur Kristóf, Ivan Lipski,
	Dominik Kaszewski, amd-gfx, linux-kernel

There should be a mechanism for drivers to respond to flip_done
timeouts. Since, as it stands it is possible for the display to stall
indefinitely, necessitating a hard reset. So, introduce a new mechanism
that tries various methods of recovery with increasing aggression, in
the following order:

1. Force a full modeset (have the compositor reprogram the state from
   scratch).
2. As a last resort, have the driver attempt a vendor specific reset
   (which they can do by reading the return value of
   drm_atomic_helper_wait_for_flip_done()).

Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v2: new to the series
v3: get rid of page_flip_timeout() and have
    drm_atomic_helper_wait_for_flip_done() return a error.
v4: get rid of nested ret variable.
---
 drivers/gpu/drm/drm_atomic_helper.c | 47 ++++++++++++++++++++++++-----
 include/drm/drm_atomic_helper.h     |  4 +--
 include/drm/drm_device.h            | 24 +++++++++++++++
 3 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index 5840e9cc6f66..d905eb166225 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -42,6 +42,7 @@
 #include <drm/drm_gem_atomic_helper.h>
 #include <drm/drm_panic.h>
 #include <drm/drm_print.h>
+#include <drm/drm_probe_helper.h>
 #include <drm/drm_self_refresh_helper.h>
 #include <drm/drm_vblank.h>
 #include <drm/drm_writeback.h>
@@ -1864,30 +1865,62 @@ EXPORT_SYMBOL(drm_atomic_helper_wait_for_vblanks);
  *
  * This requires that drivers use the nonblocking commit tracking support
  * initialized using drm_atomic_helper_setup_commit().
+ *
+ * Returns:
+ * -ETIMEDOUT to indicate that drivers can attempt a vendor reset, 0 otherwise.
  */
-void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
-					  struct drm_atomic_state *state)
+int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
+					 struct drm_atomic_state *state)
 {
 	struct drm_crtc *crtc;
+	int ret = 0;
 	int i;
 
 	for (i = 0; i < dev->mode_config.num_crtc; i++) {
 		struct drm_crtc_commit *commit = state->crtcs[i].commit;
-		int ret;
 
 		crtc = state->crtcs[i].ptr;
 
 		if (!crtc || !commit)
 			continue;
 
-		ret = wait_for_completion_timeout(&commit->flip_done, 10 * HZ);
-		if (ret == 0)
-			drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
-				crtc->base.id, crtc->name);
+		if (!wait_for_completion_timeout(&commit->flip_done, 10 * HZ)) {
+			switch (dev->reset_phase) {
+			case DRM_KMS_RESET_NONE:
+				drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
+					crtc->base.id, crtc->name);
+				dev->reset_phase = DRM_KMS_RESET_FORCE_MODESET;
+				drm_kms_helper_hotplug_event(dev);
+				break;
+			case DRM_KMS_RESET_FORCE_MODESET:
+				drm_err(dev, "[CRTC:%d:%s] force full modeset failed\n",
+					crtc->base.id, crtc->name);
+				dev->reset_phase = DRM_KMS_RESET_VENDOR;
+				ret = -ETIMEDOUT;
+				break;
+			case DRM_KMS_RESET_VENDOR:
+				drm_err(dev, "[CRTC:%d:%s] KMS recovery failed!\n",
+					crtc->base.id, crtc->name);
+				dev->reset_phase = DRM_KMS_RESET_GIVE_UP;
+				break;
+			default:
+				break;
+			}
+
+			goto exit;
+		}
+	}
+
+	if (dev->reset_phase) {
+		drm_info(dev, "KMS recovery succeeded!\n");
+		dev->reset_phase = DRM_KMS_RESET_NONE;
 	}
 
+exit:
 	if (state->fake_commit)
 		complete_all(&state->fake_commit->flip_done);
+
+	return ret;
 }
 EXPORT_SYMBOL(drm_atomic_helper_wait_for_flip_done);
 
diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h
index 53382fe93537..298c8dff3993 100644
--- a/include/drm/drm_atomic_helper.h
+++ b/include/drm/drm_atomic_helper.h
@@ -79,8 +79,8 @@ int drm_atomic_helper_wait_for_fences(struct drm_device *dev,
 void drm_atomic_helper_wait_for_vblanks(struct drm_device *dev,
 					struct drm_atomic_state *old_state);
 
-void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
-					  struct drm_atomic_state *old_state);
+int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
+					 struct drm_atomic_state *old_state);
 
 void
 drm_atomic_helper_update_legacy_modeset_state(struct drm_device *dev,
diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
index bc78fb77cc27..1244d7527e7b 100644
--- a/include/drm/drm_device.h
+++ b/include/drm/drm_device.h
@@ -66,6 +66,23 @@ enum switch_power_state {
 	DRM_SWITCH_POWER_DYNAMIC_OFF = 3,
 };
 
+/**
+ * enum drm_kms_reset_phase - reset phase of drm device
+ */
+enum drm_kms_reset_phase {
+	/** @DRM_KMS_RESET_NONE: Not currently attempting recovery */
+	DRM_KMS_RESET_NONE,
+
+	/** @DRM_KMS_RESET_FORCE_MODESET: Force a full modeset */
+	DRM_KMS_RESET_FORCE_MODESET,
+
+	/** @DRM_KMS_RESET_VENDOR: Attempt a vendor reset */
+	DRM_KMS_RESET_VENDOR,
+
+	/** @DRM_KMS_RESET_GIVE_UP: All recovery methods failed */
+	DRM_KMS_RESET_GIVE_UP,
+};
+
 /**
  * struct drm_device - DRM device structure
  *
@@ -375,6 +392,13 @@ struct drm_device {
 	 * Root directory for debugfs files.
 	 */
 	struct dentry *debugfs_root;
+
+	/**
+	 * @reset_phase:
+	 *
+	 * Reset phase that the device is in.
+	 */
+	enum drm_kms_reset_phase reset_phase;
 };
 
 void drm_dev_set_dma_dev(struct drm_device *dev, struct device *dma_dev);
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v4 2/2] drm/amd/display: add vendor specific reset
  2026-02-20 17:15 [PATCH v4 1/2] drm: introduce KMS recovery mechanism Hamza Mahfooz
@ 2026-02-20 17:15 ` Hamza Mahfooz
  2026-02-20 17:37   ` Hamza Mahfooz
                     ` (2 more replies)
  2026-02-28 18:48 ` [PATCH v4 1/2] drm: introduce KMS recovery mechanism Shengyu Qu
  1 sibling, 3 replies; 7+ messages in thread
From: Hamza Mahfooz @ 2026-02-20 17:15 UTC (permalink / raw)
  To: dri-devel
  Cc: Michel Dänzer, Mario Limonciello, Hamza Mahfooz,
	Harry Wentland, Leo Li, Rodrigo Siqueira, Alex Deucher,
	Christian König, David Airlie, Simona Vetter,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, Alex Hung,
	Aurabindo Pillai, Wayne Lin, Timur Kristóf, Ivan Lipski,
	Dominik Kaszewski, amd-gfx, linux-kernel

We now have a means to respond to page flip timeouts. So, hook up
support by trying to reload DMUB firmware if
drm_atomic_helper_wait_for_flip_done() fails. Also, send out a wedged
event if the firmware reload fails.

Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v2: send a wedged event instead of attempting a GPU reset.
v3: read return value of drm_atomic_helper_wait_for_flip_done().
v4: only send wedged event if firmware reload fails and send out "fake"
    page flip completes.
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 7c51d8d7e73c..0ae6ada22fb0 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -87,6 +87,7 @@
 #include <drm/drm_atomic_uapi.h>
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_blend.h>
+#include <drm/drm_drv.h>
 #include <drm/drm_fixed.h>
 #include <drm/drm_fourcc.h>
 #include <drm/drm_edid.h>
@@ -10829,6 +10830,7 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
 	struct drm_connector_state *old_con_state = NULL, *new_con_state = NULL;
 	struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
 	int crtc_disable_count = 0;
+	struct amdgpu_crtc *acrtc;
 
 	trace_amdgpu_dm_atomic_commit_tail_begin(state);
 
@@ -11085,8 +11087,26 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
 	/* Signal HW programming completion */
 	drm_atomic_helper_commit_hw_done(state);
 
-	if (wait_for_vblank)
-		drm_atomic_helper_wait_for_flip_done(dev, state);
+	if (wait_for_vblank &&
+	    drm_atomic_helper_wait_for_flip_done(dev, state)) {
+		mutex_lock(&dm->dc_lock);
+		if (dm_dmub_hw_init(adev))
+			drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
+					     DRM_WEDGE_RECOVERY_BUS_RESET,
+					     NULL);
+		mutex_unlock(&dm->dc_lock);
+
+		spin_lock_irqsave(&dev->event_lock, flags);
+		drm_for_each_crtc(crtc, dev) {
+			if (acrtc->event) {
+				drm_crtc_send_vblank_event(crtc, acrtc->event);
+				acrtc->event = NULL;
+				drm_crtc_vblank_put(crtc);
+				acrtc->pflip_status = AMDGPU_FLIP_NONE;
+			}
+		}
+		spin_unlock_irqrestore(&dev->event_lock, flags);
+	}
 
 	drm_atomic_helper_cleanup_planes(dev, state);
 
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 2/2] drm/amd/display: add vendor specific reset
  2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
@ 2026-02-20 17:37   ` Hamza Mahfooz
  2026-02-22  1:32   ` kernel test robot
  2026-02-23  9:34   ` Christian König
  2 siblings, 0 replies; 7+ messages in thread
From: Hamza Mahfooz @ 2026-02-20 17:37 UTC (permalink / raw)
  To: dri-devel
  Cc: Michel Dänzer, Mario Limonciello, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Aurabindo Pillai, Wayne Lin,
	Timur Kristóf, Ivan Lipski, Dominik Kaszewski, amd-gfx,
	linux-kernel

On Fri, Feb 20, 2026 at 12:15:13PM -0500, Hamza Mahfooz wrote:
> We now have a means to respond to page flip timeouts. So, hook up
> support by trying to reload DMUB firmware if
> drm_atomic_helper_wait_for_flip_done() fails. Also, send out a wedged
> event if the firmware reload fails.
> 
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v2: send a wedged event instead of attempting a GPU reset.
> v3: read return value of drm_atomic_helper_wait_for_flip_done().
> v4: only send wedged event if firmware reload fails and send out "fake"
>     page flip completes.
> ---
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 24 +++++++++++++++++--
>  1 file changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 7c51d8d7e73c..0ae6ada22fb0 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -87,6 +87,7 @@
>  #include <drm/drm_atomic_uapi.h>
>  #include <drm/drm_atomic_helper.h>
>  #include <drm/drm_blend.h>
> +#include <drm/drm_drv.h>
>  #include <drm/drm_fixed.h>
>  #include <drm/drm_fourcc.h>
>  #include <drm/drm_edid.h>
> @@ -10829,6 +10830,7 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
>  	struct drm_connector_state *old_con_state = NULL, *new_con_state = NULL;
>  	struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
>  	int crtc_disable_count = 0;
> +	struct amdgpu_crtc *acrtc;
>  
>  	trace_amdgpu_dm_atomic_commit_tail_begin(state);
>  
> @@ -11085,8 +11087,26 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
>  	/* Signal HW programming completion */
>  	drm_atomic_helper_commit_hw_done(state);
>  
> -	if (wait_for_vblank)
> -		drm_atomic_helper_wait_for_flip_done(dev, state);
> +	if (wait_for_vblank &&
> +	    drm_atomic_helper_wait_for_flip_done(dev, state)) {
> +		mutex_lock(&dm->dc_lock);
> +		if (dm_dmub_hw_init(adev))
> +			drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
> +					     DRM_WEDGE_RECOVERY_BUS_RESET,
> +					     NULL);
> +		mutex_unlock(&dm->dc_lock);
> +
> +		spin_lock_irqsave(&dev->event_lock, flags);
> +		drm_for_each_crtc(crtc, dev) {

Whoops, sent out an older version of this patch, the following line is
here as of the latest (tested) version:

			acrtc = to_amdgpu_crtc(crtc);
> +			if (acrtc->event) {

> +				drm_crtc_send_vblank_event(crtc, acrtc->event);
> +				acrtc->event = NULL;
> +				drm_crtc_vblank_put(crtc);
> +				acrtc->pflip_status = AMDGPU_FLIP_NONE;
> +			}
> +		}
> +		spin_unlock_irqrestore(&dev->event_lock, flags);
> +	}
>  
>  	drm_atomic_helper_cleanup_planes(dev, state);
>  
> -- 
> 2.53.0
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 2/2] drm/amd/display: add vendor specific reset
  2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
  2026-02-20 17:37   ` Hamza Mahfooz
@ 2026-02-22  1:32   ` kernel test robot
  2026-02-23  9:34   ` Christian König
  2 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2026-02-22  1:32 UTC (permalink / raw)
  To: Hamza Mahfooz, dri-devel
  Cc: llvm, oe-kbuild-all, Michel Dänzer, Mario Limonciello,
	Hamza Mahfooz, Harry Wentland, Leo Li, Rodrigo Siqueira,
	Alex Deucher, Christian König, David Airlie, Simona Vetter,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, Alex Hung,
	Aurabindo Pillai, Wayne Lin, Timur Kristóf, Ivan Lipski,
	Dominik Kaszewski, amd-gfx, linux-kernel

Hi Hamza,

kernel test robot noticed the following build warnings:

[auto build test WARNING on drm-misc/drm-misc-next]
[also build test WARNING on daeinki-drm-exynos/exynos-drm-next drm/drm-next drm-i915/for-linux-next drm-i915/for-linux-next-fixes drm-tip/drm-tip linus/master v6.19 next-20260220]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Hamza-Mahfooz/drm-amd-display-add-vendor-specific-reset/20260221-011745
base:   https://gitlab.freedesktop.org/drm/misc/kernel.git drm-misc-next
patch link:    https://lore.kernel.org/r/20260220171518.711594-2-someguy%40effective-light.com
patch subject: [PATCH v4 2/2] drm/amd/display: add vendor specific reset
config: x86_64-randconfig-072-20260221 (https://download.01.org/0day-ci/archive/20260222/202602220950.2iCpIKFc-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260222/202602220950.2iCpIKFc-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602220950.2iCpIKFc-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:11143:8: warning: variable 'acrtc' is uninitialized when used here [-Wuninitialized]
    11143 |                         if (acrtc->event) {
          |                             ^~~~~
   drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:10875:27: note: initialize the variable 'acrtc' to silence this warning
    10875 |         struct amdgpu_crtc *acrtc;
          |                                  ^
          |                                   = NULL
   1 warning generated.


vim +/acrtc +11143 drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c

 11021	
 11022			if (new_crtc_state->active &&
 11023			    (!old_crtc_state->active ||
 11024			     drm_atomic_crtc_needs_modeset(new_crtc_state))) {
 11025				dc_stream_retain(dm_new_crtc_state->stream);
 11026				acrtc->dm_irq_params.stream = dm_new_crtc_state->stream;
 11027				manage_dm_interrupts(adev, acrtc, dm_new_crtc_state);
 11028			}
 11029			/* Handle vrr on->off / off->on transitions */
 11030			amdgpu_dm_handle_vrr_transition(dm_old_crtc_state, dm_new_crtc_state);
 11031	
 11032	#ifdef CONFIG_DEBUG_FS
 11033			if (new_crtc_state->active &&
 11034			    (!old_crtc_state->active ||
 11035			     drm_atomic_crtc_needs_modeset(new_crtc_state))) {
 11036				/**
 11037				 * Frontend may have changed so reapply the CRC capture
 11038				 * settings for the stream.
 11039				 */
 11040				if (amdgpu_dm_is_valid_crc_source(cur_crc_src)) {
 11041	#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY)
 11042					if (amdgpu_dm_crc_window_is_activated(crtc)) {
 11043						uint8_t cnt;
 11044	
 11045						spin_lock_irqsave(&adev_to_drm(adev)->event_lock, flags);
 11046						for (cnt = 0; cnt < MAX_CRC_WINDOW_NUM; cnt++) {
 11047							if (acrtc->dm_irq_params.window_param[cnt].enable) {
 11048								acrtc->dm_irq_params.window_param[cnt].update_win = true;
 11049	
 11050								/**
 11051								 * It takes 2 frames for HW to stably generate CRC when
 11052								 * resuming from suspend, so we set skip_frame_cnt 2.
 11053								 */
 11054								acrtc->dm_irq_params.window_param[cnt].skip_frame_cnt = 2;
 11055							}
 11056						}
 11057						spin_unlock_irqrestore(&adev_to_drm(adev)->event_lock, flags);
 11058					}
 11059	#endif
 11060					if (amdgpu_dm_crtc_configure_crc_source(
 11061						crtc, dm_new_crtc_state, cur_crc_src))
 11062						drm_dbg_atomic(dev, "Failed to configure crc source");
 11063				}
 11064			}
 11065	#endif
 11066		}
 11067	
 11068		for_each_new_crtc_in_state(state, crtc, new_crtc_state, j)
 11069			if (new_crtc_state->async_flip)
 11070				wait_for_vblank = false;
 11071	
 11072		/* update planes when needed per crtc*/
 11073		for_each_new_crtc_in_state(state, crtc, new_crtc_state, j) {
 11074			dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
 11075	
 11076			if (dm_new_crtc_state->stream)
 11077				amdgpu_dm_commit_planes(state, dev, dm, crtc, wait_for_vblank);
 11078		}
 11079	
 11080		/* Enable writeback */
 11081		for_each_new_connector_in_state(state, connector, new_con_state, i) {
 11082			struct dm_connector_state *dm_new_con_state = to_dm_connector_state(new_con_state);
 11083			struct amdgpu_crtc *acrtc = to_amdgpu_crtc(dm_new_con_state->base.crtc);
 11084	
 11085			if (connector->connector_type != DRM_MODE_CONNECTOR_WRITEBACK)
 11086				continue;
 11087	
 11088			if (!new_con_state->writeback_job)
 11089				continue;
 11090	
 11091			new_crtc_state = drm_atomic_get_new_crtc_state(state, &acrtc->base);
 11092	
 11093			if (!new_crtc_state)
 11094				continue;
 11095	
 11096			if (acrtc->wb_enabled)
 11097				continue;
 11098	
 11099			dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
 11100	
 11101			dm_set_writeback(dm, dm_new_crtc_state, connector, new_con_state);
 11102			acrtc->wb_enabled = true;
 11103		}
 11104	
 11105		/* Update audio instances for each connector. */
 11106		amdgpu_dm_commit_audio(dev, state);
 11107	
 11108		/* restore the backlight level */
 11109		for (i = 0; i < dm->num_of_edps; i++) {
 11110			if (dm->backlight_dev[i] &&
 11111			    (dm->actual_brightness[i] != dm->brightness[i]))
 11112				amdgpu_dm_backlight_set_level(dm, i, dm->brightness[i]);
 11113		}
 11114	
 11115		/*
 11116		 * send vblank event on all events not handled in flip and
 11117		 * mark consumed event for drm_atomic_helper_commit_hw_done
 11118		 */
 11119		spin_lock_irqsave(&adev_to_drm(adev)->event_lock, flags);
 11120		for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) {
 11121	
 11122			if (new_crtc_state->event)
 11123				drm_send_event_locked(dev, &new_crtc_state->event->base);
 11124	
 11125			new_crtc_state->event = NULL;
 11126		}
 11127		spin_unlock_irqrestore(&adev_to_drm(adev)->event_lock, flags);
 11128	
 11129		/* Signal HW programming completion */
 11130		drm_atomic_helper_commit_hw_done(state);
 11131	
 11132		if (wait_for_vblank &&
 11133		    drm_atomic_helper_wait_for_flip_done(dev, state)) {
 11134			mutex_lock(&dm->dc_lock);
 11135			if (dm_dmub_hw_init(adev))
 11136				drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
 11137						     DRM_WEDGE_RECOVERY_BUS_RESET,
 11138						     NULL);
 11139			mutex_unlock(&dm->dc_lock);
 11140	
 11141			spin_lock_irqsave(&dev->event_lock, flags);
 11142			drm_for_each_crtc(crtc, dev) {
 11143				if (acrtc->event) {
 11144					drm_crtc_send_vblank_event(crtc, acrtc->event);
 11145					acrtc->event = NULL;
 11146					drm_crtc_vblank_put(crtc);
 11147					acrtc->pflip_status = AMDGPU_FLIP_NONE;
 11148				}
 11149			}
 11150			spin_unlock_irqrestore(&dev->event_lock, flags);
 11151		}
 11152	
 11153		drm_atomic_helper_cleanup_planes(dev, state);
 11154	
 11155		/* Don't free the memory if we are hitting this as part of suspend.
 11156		 * This way we don't free any memory during suspend; see
 11157		 * amdgpu_bo_free_kernel().  The memory will be freed in the first
 11158		 * non-suspend modeset or when the driver is torn down.
 11159		 */
 11160		if (!adev->in_suspend) {
 11161			/* return the stolen vga memory back to VRAM */
 11162			if (!adev->mman.keep_stolen_vga_memory)
 11163				amdgpu_bo_free_kernel(&adev->mman.stolen_vga_memory, NULL, NULL);
 11164			amdgpu_bo_free_kernel(&adev->mman.stolen_extended_memory, NULL, NULL);
 11165		}
 11166	
 11167		/*
 11168		 * Finally, drop a runtime PM reference for each newly disabled CRTC,
 11169		 * so we can put the GPU into runtime suspend if we're not driving any
 11170		 * displays anymore
 11171		 */
 11172		for (i = 0; i < crtc_disable_count; i++)
 11173			pm_runtime_put_autosuspend(dev->dev);
 11174		pm_runtime_mark_last_busy(dev->dev);
 11175	
 11176		trace_amdgpu_dm_atomic_commit_tail_finish(state);
 11177	}
 11178	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 2/2] drm/amd/display: add vendor specific reset
  2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
  2026-02-20 17:37   ` Hamza Mahfooz
  2026-02-22  1:32   ` kernel test robot
@ 2026-02-23  9:34   ` Christian König
  2026-02-27 20:58     ` Hamza Mahfooz
  2 siblings, 1 reply; 7+ messages in thread
From: Christian König @ 2026-02-23  9:34 UTC (permalink / raw)
  To: Hamza Mahfooz, dri-devel
  Cc: Michel Dänzer, Mario Limonciello, Harry Wentland, Leo Li,
	Rodrigo Siqueira, Alex Deucher, David Airlie, Simona Vetter,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, Alex Hung,
	Aurabindo Pillai, Wayne Lin, Timur Kristóf, Ivan Lipski,
	Dominik Kaszewski, amd-gfx, linux-kernel



On 2/20/26 18:15, Hamza Mahfooz wrote:
> We now have a means to respond to page flip timeouts. So, hook up
> support by trying to reload DMUB firmware if
> drm_atomic_helper_wait_for_flip_done() fails. Also, send out a wedged
> event if the firmware reload fails.
> 
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v2: send a wedged event instead of attempting a GPU reset.
> v3: read return value of drm_atomic_helper_wait_for_flip_done().
> v4: only send wedged event if firmware reload fails and send out "fake"
>     page flip completes.
> ---
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 24 +++++++++++++++++--
>  1 file changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 7c51d8d7e73c..0ae6ada22fb0 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -87,6 +87,7 @@
>  #include <drm/drm_atomic_uapi.h>
>  #include <drm/drm_atomic_helper.h>
>  #include <drm/drm_blend.h>
> +#include <drm/drm_drv.h>
>  #include <drm/drm_fixed.h>
>  #include <drm/drm_fourcc.h>
>  #include <drm/drm_edid.h>
> @@ -10829,6 +10830,7 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
>  	struct drm_connector_state *old_con_state = NULL, *new_con_state = NULL;
>  	struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
>  	int crtc_disable_count = 0;
> +	struct amdgpu_crtc *acrtc;
>  
>  	trace_amdgpu_dm_atomic_commit_tail_begin(state);
>  
> @@ -11085,8 +11087,26 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
>  	/* Signal HW programming completion */
>  	drm_atomic_helper_commit_hw_done(state);
>  
> -	if (wait_for_vblank)
> -		drm_atomic_helper_wait_for_flip_done(dev, state);
> +	if (wait_for_vblank &&
> +	    drm_atomic_helper_wait_for_flip_done(dev, state)) {
> +		mutex_lock(&dm->dc_lock);
> +		if (dm_dmub_hw_init(adev))

From Michel's explanation that is pretty much a no-go because it potentially causes other atomic commits to react in unforeseen ways.

> +			drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
> +					     DRM_WEDGE_RECOVERY_BUS_RESET,
> +					     NULL);

Please completely drop that. This here is a sledge hammer and not going to fly anywhere.

> +		mutex_unlock(&dm->dc_lock);
> +
> +		spin_lock_irqsave(&dev->event_lock, flags);
> +		drm_for_each_crtc(crtc, dev) {

This should go over only the CRTCs in the atomic commit currently handled.

Have you tried sending a hotplug event for the connectors in question as Michel suggested?

Regards,
Christian.

> +			if (acrtc->event) {
> +				drm_crtc_send_vblank_event(crtc, acrtc->event);
> +				acrtc->event = NULL;
> +				drm_crtc_vblank_put(crtc);
> +				acrtc->pflip_status = AMDGPU_FLIP_NONE;
> +			}
> +		}
> +		spin_unlock_irqrestore(&dev->event_lock, flags);
> +	}
>  
>  	drm_atomic_helper_cleanup_planes(dev, state);
>  


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 2/2] drm/amd/display: add vendor specific reset
  2026-02-23  9:34   ` Christian König
@ 2026-02-27 20:58     ` Hamza Mahfooz
  0 siblings, 0 replies; 7+ messages in thread
From: Hamza Mahfooz @ 2026-02-27 20:58 UTC (permalink / raw)
  To: Christian König
  Cc: dri-devel, Michel Dänzer, Mario Limonciello, Harry Wentland,
	Leo Li, Rodrigo Siqueira, Alex Deucher, David Airlie,
	Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Aurabindo Pillai, Wayne Lin,
	Timur Kristóf, Ivan Lipski, Dominik Kaszewski, amd-gfx,
	linux-kernel

On Mon, Feb 23, 2026 at 10:34:17AM +0100, Christian König wrote:
> > @@ -11085,8 +11087,26 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
> >  	/* Signal HW programming completion */
> >  	drm_atomic_helper_commit_hw_done(state);
> >  
> > -	if (wait_for_vblank)
> > -		drm_atomic_helper_wait_for_flip_done(dev, state);
> > +	if (wait_for_vblank &&
> > +	    drm_atomic_helper_wait_for_flip_done(dev, state)) {
> > +		mutex_lock(&dm->dc_lock);
> > +		if (dm_dmub_hw_init(adev))
> 
> From Michel's explanation that is pretty much a no-go because it potentially causes other atomic commits to react in unforeseen ways.
> 

This code would only run if the forced modeset fails, which is to say we
are already in a hung state, so I don't expect any other atomic commits
to be firing off. Also, evidently the timeout isn't a one off, so it's
almost certainly caused by hung firmware and not by a bug in the
driver's vblanking code.

> > +			drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
> > +					     DRM_WEDGE_RECOVERY_BUS_RESET,
> > +					     NULL);
> 
> Please completely drop that. This here is a sledge hammer and not going to fly anywhere.
> 

I don't feel too strongly about it, though isn't the point to inform
userspace that we were unable to recover? AFAIK the prescribed methods
are mere suggestions and users can choose to ignore them if they feel
that they are too hard hitting.

> > +		mutex_unlock(&dm->dc_lock);
> > +
> > +		spin_lock_irqsave(&dev->event_lock, flags);
> > +		drm_for_each_crtc(crtc, dev) {
> 
> This should go over only the CRTCs in the atomic commit currently handled.
> 
> Have you tried sending a hotplug event for the connectors in question as Michel suggested?
> 

Sure, I can look into that. However, we would still need the firmware
reload. Otherwise, we would just be forcing a modeset twice in
succession.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 1/2] drm: introduce KMS recovery mechanism
  2026-02-20 17:15 [PATCH v4 1/2] drm: introduce KMS recovery mechanism Hamza Mahfooz
  2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
@ 2026-02-28 18:48 ` Shengyu Qu
  1 sibling, 0 replies; 7+ messages in thread
From: Shengyu Qu @ 2026-02-28 18:48 UTC (permalink / raw)
  To: Hamza Mahfooz, dri-devel
  Cc: wiagn233, Michel Dänzer, Mario Limonciello, Harry Wentland,
	Leo Li, Rodrigo Siqueira, Alex Deucher, Christian König,
	David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, Alex Hung, Aurabindo Pillai, Wayne Lin,
	Timur Kristóf, Ivan Lipski, Dominik Kaszewski, amd-gfx,
	linux-kernel


[-- Attachment #1.1.1: Type: text/plain, Size: 6073 bytes --]

Hi,

Since you can reproduce this timeout issue, can you try the debug method
here[1] to confirm if your issue is the same as that patch?

[1] https://lists.freedesktop.org/archives/amd-gfx/2026-February/138842.html

Best regards,
Shengyu

在 2026/2/21 1:15, Hamza Mahfooz 写道:
> There should be a mechanism for drivers to respond to flip_done
> timeouts. Since, as it stands it is possible for the display to stall
> indefinitely, necessitating a hard reset. So, introduce a new mechanism
> that tries various methods of recovery with increasing aggression, in
> the following order:
> 
> 1. Force a full modeset (have the compositor reprogram the state from
>     scratch).
> 2. As a last resort, have the driver attempt a vendor specific reset
>     (which they can do by reading the return value of
>     drm_atomic_helper_wait_for_flip_done()).
> 
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v2: new to the series
> v3: get rid of page_flip_timeout() and have
>      drm_atomic_helper_wait_for_flip_done() return a error.
> v4: get rid of nested ret variable.
> ---
>   drivers/gpu/drm/drm_atomic_helper.c | 47 ++++++++++++++++++++++++-----
>   include/drm/drm_atomic_helper.h     |  4 +--
>   include/drm/drm_device.h            | 24 +++++++++++++++
>   3 files changed, 66 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
> index 5840e9cc6f66..d905eb166225 100644
> --- a/drivers/gpu/drm/drm_atomic_helper.c
> +++ b/drivers/gpu/drm/drm_atomic_helper.c
> @@ -42,6 +42,7 @@
>   #include <drm/drm_gem_atomic_helper.h>
>   #include <drm/drm_panic.h>
>   #include <drm/drm_print.h>
> +#include <drm/drm_probe_helper.h>
>   #include <drm/drm_self_refresh_helper.h>
>   #include <drm/drm_vblank.h>
>   #include <drm/drm_writeback.h>
> @@ -1864,30 +1865,62 @@ EXPORT_SYMBOL(drm_atomic_helper_wait_for_vblanks);
>    *
>    * This requires that drivers use the nonblocking commit tracking support
>    * initialized using drm_atomic_helper_setup_commit().
> + *
> + * Returns:
> + * -ETIMEDOUT to indicate that drivers can attempt a vendor reset, 0 otherwise.
>    */
> -void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> -					  struct drm_atomic_state *state)
> +int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> +					 struct drm_atomic_state *state)
>   {
>   	struct drm_crtc *crtc;
> +	int ret = 0;
>   	int i;
>   
>   	for (i = 0; i < dev->mode_config.num_crtc; i++) {
>   		struct drm_crtc_commit *commit = state->crtcs[i].commit;
> -		int ret;
>   
>   		crtc = state->crtcs[i].ptr;
>   
>   		if (!crtc || !commit)
>   			continue;
>   
> -		ret = wait_for_completion_timeout(&commit->flip_done, 10 * HZ);
> -		if (ret == 0)
> -			drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
> -				crtc->base.id, crtc->name);
> +		if (!wait_for_completion_timeout(&commit->flip_done, 10 * HZ)) {
> +			switch (dev->reset_phase) {
> +			case DRM_KMS_RESET_NONE:
> +				drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
> +					crtc->base.id, crtc->name);
> +				dev->reset_phase = DRM_KMS_RESET_FORCE_MODESET;
> +				drm_kms_helper_hotplug_event(dev);
> +				break;
> +			case DRM_KMS_RESET_FORCE_MODESET:
> +				drm_err(dev, "[CRTC:%d:%s] force full modeset failed\n",
> +					crtc->base.id, crtc->name);
> +				dev->reset_phase = DRM_KMS_RESET_VENDOR;
> +				ret = -ETIMEDOUT;
> +				break;
> +			case DRM_KMS_RESET_VENDOR:
> +				drm_err(dev, "[CRTC:%d:%s] KMS recovery failed!\n",
> +					crtc->base.id, crtc->name);
> +				dev->reset_phase = DRM_KMS_RESET_GIVE_UP;
> +				break;
> +			default:
> +				break;
> +			}
> +
> +			goto exit;
> +		}
> +	}
> +
> +	if (dev->reset_phase) {
> +		drm_info(dev, "KMS recovery succeeded!\n");
> +		dev->reset_phase = DRM_KMS_RESET_NONE;
>   	}
>   
> +exit:
>   	if (state->fake_commit)
>   		complete_all(&state->fake_commit->flip_done);
> +
> +	return ret;
>   }
>   EXPORT_SYMBOL(drm_atomic_helper_wait_for_flip_done);
>   
> diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h
> index 53382fe93537..298c8dff3993 100644
> --- a/include/drm/drm_atomic_helper.h
> +++ b/include/drm/drm_atomic_helper.h
> @@ -79,8 +79,8 @@ int drm_atomic_helper_wait_for_fences(struct drm_device *dev,
>   void drm_atomic_helper_wait_for_vblanks(struct drm_device *dev,
>   					struct drm_atomic_state *old_state);
>   
> -void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> -					  struct drm_atomic_state *old_state);
> +int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> +					 struct drm_atomic_state *old_state);
>   
>   void
>   drm_atomic_helper_update_legacy_modeset_state(struct drm_device *dev,
> diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
> index bc78fb77cc27..1244d7527e7b 100644
> --- a/include/drm/drm_device.h
> +++ b/include/drm/drm_device.h
> @@ -66,6 +66,23 @@ enum switch_power_state {
>   	DRM_SWITCH_POWER_DYNAMIC_OFF = 3,
>   };
>   
> +/**
> + * enum drm_kms_reset_phase - reset phase of drm device
> + */
> +enum drm_kms_reset_phase {
> +	/** @DRM_KMS_RESET_NONE: Not currently attempting recovery */
> +	DRM_KMS_RESET_NONE,
> +
> +	/** @DRM_KMS_RESET_FORCE_MODESET: Force a full modeset */
> +	DRM_KMS_RESET_FORCE_MODESET,
> +
> +	/** @DRM_KMS_RESET_VENDOR: Attempt a vendor reset */
> +	DRM_KMS_RESET_VENDOR,
> +
> +	/** @DRM_KMS_RESET_GIVE_UP: All recovery methods failed */
> +	DRM_KMS_RESET_GIVE_UP,
> +};
> +
>   /**
>    * struct drm_device - DRM device structure
>    *
> @@ -375,6 +392,13 @@ struct drm_device {
>   	 * Root directory for debugfs files.
>   	 */
>   	struct dentry *debugfs_root;
> +
> +	/**
> +	 * @reset_phase:
> +	 *
> +	 * Reset phase that the device is in.
> +	 */
> +	enum drm_kms_reset_phase reset_phase;
>   };
>   
>   void drm_dev_set_dma_dev(struct drm_device *dev, struct device *dma_dev);


[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 6977 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 840 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2026-02-28 18:48 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-02-20 17:15 [PATCH v4 1/2] drm: introduce KMS recovery mechanism Hamza Mahfooz
2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
2026-02-20 17:37   ` Hamza Mahfooz
2026-02-22  1:32   ` kernel test robot
2026-02-23  9:34   ` Christian König
2026-02-27 20:58     ` Hamza Mahfooz
2026-02-28 18:48 ` [PATCH v4 1/2] drm: introduce KMS recovery mechanism Shengyu Qu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox