* [PATCH v4 1/2] drm: introduce KMS recovery mechanism
@ 2026-02-20 17:15 Hamza Mahfooz
2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
2026-02-28 18:48 ` [PATCH v4 1/2] drm: introduce KMS recovery mechanism Shengyu Qu
0 siblings, 2 replies; 7+ messages in thread
From: Hamza Mahfooz @ 2026-02-20 17:15 UTC (permalink / raw)
To: dri-devel
Cc: Michel Dänzer, Mario Limonciello, Hamza Mahfooz,
Harry Wentland, Leo Li, Rodrigo Siqueira, Alex Deucher,
Christian König, David Airlie, Simona Vetter,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, Alex Hung,
Aurabindo Pillai, Wayne Lin, Timur Kristóf, Ivan Lipski,
Dominik Kaszewski, amd-gfx, linux-kernel
There should be a mechanism for drivers to respond to flip_done
timeouts. Since, as it stands it is possible for the display to stall
indefinitely, necessitating a hard reset. So, introduce a new mechanism
that tries various methods of recovery with increasing aggression, in
the following order:
1. Force a full modeset (have the compositor reprogram the state from
scratch).
2. As a last resort, have the driver attempt a vendor specific reset
(which they can do by reading the return value of
drm_atomic_helper_wait_for_flip_done()).
Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v2: new to the series
v3: get rid of page_flip_timeout() and have
drm_atomic_helper_wait_for_flip_done() return a error.
v4: get rid of nested ret variable.
---
drivers/gpu/drm/drm_atomic_helper.c | 47 ++++++++++++++++++++++++-----
include/drm/drm_atomic_helper.h | 4 +--
include/drm/drm_device.h | 24 +++++++++++++++
3 files changed, 66 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index 5840e9cc6f66..d905eb166225 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -42,6 +42,7 @@
#include <drm/drm_gem_atomic_helper.h>
#include <drm/drm_panic.h>
#include <drm/drm_print.h>
+#include <drm/drm_probe_helper.h>
#include <drm/drm_self_refresh_helper.h>
#include <drm/drm_vblank.h>
#include <drm/drm_writeback.h>
@@ -1864,30 +1865,62 @@ EXPORT_SYMBOL(drm_atomic_helper_wait_for_vblanks);
*
* This requires that drivers use the nonblocking commit tracking support
* initialized using drm_atomic_helper_setup_commit().
+ *
+ * Returns:
+ * -ETIMEDOUT to indicate that drivers can attempt a vendor reset, 0 otherwise.
*/
-void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
- struct drm_atomic_state *state)
+int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
+ struct drm_atomic_state *state)
{
struct drm_crtc *crtc;
+ int ret = 0;
int i;
for (i = 0; i < dev->mode_config.num_crtc; i++) {
struct drm_crtc_commit *commit = state->crtcs[i].commit;
- int ret;
crtc = state->crtcs[i].ptr;
if (!crtc || !commit)
continue;
- ret = wait_for_completion_timeout(&commit->flip_done, 10 * HZ);
- if (ret == 0)
- drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
- crtc->base.id, crtc->name);
+ if (!wait_for_completion_timeout(&commit->flip_done, 10 * HZ)) {
+ switch (dev->reset_phase) {
+ case DRM_KMS_RESET_NONE:
+ drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
+ crtc->base.id, crtc->name);
+ dev->reset_phase = DRM_KMS_RESET_FORCE_MODESET;
+ drm_kms_helper_hotplug_event(dev);
+ break;
+ case DRM_KMS_RESET_FORCE_MODESET:
+ drm_err(dev, "[CRTC:%d:%s] force full modeset failed\n",
+ crtc->base.id, crtc->name);
+ dev->reset_phase = DRM_KMS_RESET_VENDOR;
+ ret = -ETIMEDOUT;
+ break;
+ case DRM_KMS_RESET_VENDOR:
+ drm_err(dev, "[CRTC:%d:%s] KMS recovery failed!\n",
+ crtc->base.id, crtc->name);
+ dev->reset_phase = DRM_KMS_RESET_GIVE_UP;
+ break;
+ default:
+ break;
+ }
+
+ goto exit;
+ }
+ }
+
+ if (dev->reset_phase) {
+ drm_info(dev, "KMS recovery succeeded!\n");
+ dev->reset_phase = DRM_KMS_RESET_NONE;
}
+exit:
if (state->fake_commit)
complete_all(&state->fake_commit->flip_done);
+
+ return ret;
}
EXPORT_SYMBOL(drm_atomic_helper_wait_for_flip_done);
diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h
index 53382fe93537..298c8dff3993 100644
--- a/include/drm/drm_atomic_helper.h
+++ b/include/drm/drm_atomic_helper.h
@@ -79,8 +79,8 @@ int drm_atomic_helper_wait_for_fences(struct drm_device *dev,
void drm_atomic_helper_wait_for_vblanks(struct drm_device *dev,
struct drm_atomic_state *old_state);
-void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
- struct drm_atomic_state *old_state);
+int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
+ struct drm_atomic_state *old_state);
void
drm_atomic_helper_update_legacy_modeset_state(struct drm_device *dev,
diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
index bc78fb77cc27..1244d7527e7b 100644
--- a/include/drm/drm_device.h
+++ b/include/drm/drm_device.h
@@ -66,6 +66,23 @@ enum switch_power_state {
DRM_SWITCH_POWER_DYNAMIC_OFF = 3,
};
+/**
+ * enum drm_kms_reset_phase - reset phase of drm device
+ */
+enum drm_kms_reset_phase {
+ /** @DRM_KMS_RESET_NONE: Not currently attempting recovery */
+ DRM_KMS_RESET_NONE,
+
+ /** @DRM_KMS_RESET_FORCE_MODESET: Force a full modeset */
+ DRM_KMS_RESET_FORCE_MODESET,
+
+ /** @DRM_KMS_RESET_VENDOR: Attempt a vendor reset */
+ DRM_KMS_RESET_VENDOR,
+
+ /** @DRM_KMS_RESET_GIVE_UP: All recovery methods failed */
+ DRM_KMS_RESET_GIVE_UP,
+};
+
/**
* struct drm_device - DRM device structure
*
@@ -375,6 +392,13 @@ struct drm_device {
* Root directory for debugfs files.
*/
struct dentry *debugfs_root;
+
+ /**
+ * @reset_phase:
+ *
+ * Reset phase that the device is in.
+ */
+ enum drm_kms_reset_phase reset_phase;
};
void drm_dev_set_dma_dev(struct drm_device *dev, struct device *dma_dev);
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v4 2/2] drm/amd/display: add vendor specific reset
2026-02-20 17:15 [PATCH v4 1/2] drm: introduce KMS recovery mechanism Hamza Mahfooz
@ 2026-02-20 17:15 ` Hamza Mahfooz
2026-02-20 17:37 ` Hamza Mahfooz
` (2 more replies)
2026-02-28 18:48 ` [PATCH v4 1/2] drm: introduce KMS recovery mechanism Shengyu Qu
1 sibling, 3 replies; 7+ messages in thread
From: Hamza Mahfooz @ 2026-02-20 17:15 UTC (permalink / raw)
To: dri-devel
Cc: Michel Dänzer, Mario Limonciello, Hamza Mahfooz,
Harry Wentland, Leo Li, Rodrigo Siqueira, Alex Deucher,
Christian König, David Airlie, Simona Vetter,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, Alex Hung,
Aurabindo Pillai, Wayne Lin, Timur Kristóf, Ivan Lipski,
Dominik Kaszewski, amd-gfx, linux-kernel
We now have a means to respond to page flip timeouts. So, hook up
support by trying to reload DMUB firmware if
drm_atomic_helper_wait_for_flip_done() fails. Also, send out a wedged
event if the firmware reload fails.
Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v2: send a wedged event instead of attempting a GPU reset.
v3: read return value of drm_atomic_helper_wait_for_flip_done().
v4: only send wedged event if firmware reload fails and send out "fake"
page flip completes.
---
.../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 24 +++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 7c51d8d7e73c..0ae6ada22fb0 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -87,6 +87,7 @@
#include <drm/drm_atomic_uapi.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_blend.h>
+#include <drm/drm_drv.h>
#include <drm/drm_fixed.h>
#include <drm/drm_fourcc.h>
#include <drm/drm_edid.h>
@@ -10829,6 +10830,7 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
struct drm_connector_state *old_con_state = NULL, *new_con_state = NULL;
struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
int crtc_disable_count = 0;
+ struct amdgpu_crtc *acrtc;
trace_amdgpu_dm_atomic_commit_tail_begin(state);
@@ -11085,8 +11087,26 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
/* Signal HW programming completion */
drm_atomic_helper_commit_hw_done(state);
- if (wait_for_vblank)
- drm_atomic_helper_wait_for_flip_done(dev, state);
+ if (wait_for_vblank &&
+ drm_atomic_helper_wait_for_flip_done(dev, state)) {
+ mutex_lock(&dm->dc_lock);
+ if (dm_dmub_hw_init(adev))
+ drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
+ DRM_WEDGE_RECOVERY_BUS_RESET,
+ NULL);
+ mutex_unlock(&dm->dc_lock);
+
+ spin_lock_irqsave(&dev->event_lock, flags);
+ drm_for_each_crtc(crtc, dev) {
+ if (acrtc->event) {
+ drm_crtc_send_vblank_event(crtc, acrtc->event);
+ acrtc->event = NULL;
+ drm_crtc_vblank_put(crtc);
+ acrtc->pflip_status = AMDGPU_FLIP_NONE;
+ }
+ }
+ spin_unlock_irqrestore(&dev->event_lock, flags);
+ }
drm_atomic_helper_cleanup_planes(dev, state);
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH v4 2/2] drm/amd/display: add vendor specific reset
2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
@ 2026-02-20 17:37 ` Hamza Mahfooz
2026-02-22 1:32 ` kernel test robot
2026-02-23 9:34 ` Christian König
2 siblings, 0 replies; 7+ messages in thread
From: Hamza Mahfooz @ 2026-02-20 17:37 UTC (permalink / raw)
To: dri-devel
Cc: Michel Dänzer, Mario Limonciello, Harry Wentland, Leo Li,
Rodrigo Siqueira, Alex Deucher, Christian König,
David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
Thomas Zimmermann, Alex Hung, Aurabindo Pillai, Wayne Lin,
Timur Kristóf, Ivan Lipski, Dominik Kaszewski, amd-gfx,
linux-kernel
On Fri, Feb 20, 2026 at 12:15:13PM -0500, Hamza Mahfooz wrote:
> We now have a means to respond to page flip timeouts. So, hook up
> support by trying to reload DMUB firmware if
> drm_atomic_helper_wait_for_flip_done() fails. Also, send out a wedged
> event if the firmware reload fails.
>
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v2: send a wedged event instead of attempting a GPU reset.
> v3: read return value of drm_atomic_helper_wait_for_flip_done().
> v4: only send wedged event if firmware reload fails and send out "fake"
> page flip completes.
> ---
> .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 24 +++++++++++++++++--
> 1 file changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 7c51d8d7e73c..0ae6ada22fb0 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -87,6 +87,7 @@
> #include <drm/drm_atomic_uapi.h>
> #include <drm/drm_atomic_helper.h>
> #include <drm/drm_blend.h>
> +#include <drm/drm_drv.h>
> #include <drm/drm_fixed.h>
> #include <drm/drm_fourcc.h>
> #include <drm/drm_edid.h>
> @@ -10829,6 +10830,7 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
> struct drm_connector_state *old_con_state = NULL, *new_con_state = NULL;
> struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
> int crtc_disable_count = 0;
> + struct amdgpu_crtc *acrtc;
>
> trace_amdgpu_dm_atomic_commit_tail_begin(state);
>
> @@ -11085,8 +11087,26 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
> /* Signal HW programming completion */
> drm_atomic_helper_commit_hw_done(state);
>
> - if (wait_for_vblank)
> - drm_atomic_helper_wait_for_flip_done(dev, state);
> + if (wait_for_vblank &&
> + drm_atomic_helper_wait_for_flip_done(dev, state)) {
> + mutex_lock(&dm->dc_lock);
> + if (dm_dmub_hw_init(adev))
> + drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
> + DRM_WEDGE_RECOVERY_BUS_RESET,
> + NULL);
> + mutex_unlock(&dm->dc_lock);
> +
> + spin_lock_irqsave(&dev->event_lock, flags);
> + drm_for_each_crtc(crtc, dev) {
Whoops, sent out an older version of this patch, the following line is
here as of the latest (tested) version:
acrtc = to_amdgpu_crtc(crtc);
> + if (acrtc->event) {
> + drm_crtc_send_vblank_event(crtc, acrtc->event);
> + acrtc->event = NULL;
> + drm_crtc_vblank_put(crtc);
> + acrtc->pflip_status = AMDGPU_FLIP_NONE;
> + }
> + }
> + spin_unlock_irqrestore(&dev->event_lock, flags);
> + }
>
> drm_atomic_helper_cleanup_planes(dev, state);
>
> --
> 2.53.0
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v4 2/2] drm/amd/display: add vendor specific reset
2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
2026-02-20 17:37 ` Hamza Mahfooz
@ 2026-02-22 1:32 ` kernel test robot
2026-02-23 9:34 ` Christian König
2 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2026-02-22 1:32 UTC (permalink / raw)
To: Hamza Mahfooz, dri-devel
Cc: llvm, oe-kbuild-all, Michel Dänzer, Mario Limonciello,
Hamza Mahfooz, Harry Wentland, Leo Li, Rodrigo Siqueira,
Alex Deucher, Christian König, David Airlie, Simona Vetter,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, Alex Hung,
Aurabindo Pillai, Wayne Lin, Timur Kristóf, Ivan Lipski,
Dominik Kaszewski, amd-gfx, linux-kernel
Hi Hamza,
kernel test robot noticed the following build warnings:
[auto build test WARNING on drm-misc/drm-misc-next]
[also build test WARNING on daeinki-drm-exynos/exynos-drm-next drm/drm-next drm-i915/for-linux-next drm-i915/for-linux-next-fixes drm-tip/drm-tip linus/master v6.19 next-20260220]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Hamza-Mahfooz/drm-amd-display-add-vendor-specific-reset/20260221-011745
base: https://gitlab.freedesktop.org/drm/misc/kernel.git drm-misc-next
patch link: https://lore.kernel.org/r/20260220171518.711594-2-someguy%40effective-light.com
patch subject: [PATCH v4 2/2] drm/amd/display: add vendor specific reset
config: x86_64-randconfig-072-20260221 (https://download.01.org/0day-ci/archive/20260222/202602220950.2iCpIKFc-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260222/202602220950.2iCpIKFc-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602220950.2iCpIKFc-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:11143:8: warning: variable 'acrtc' is uninitialized when used here [-Wuninitialized]
11143 | if (acrtc->event) {
| ^~~~~
drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:10875:27: note: initialize the variable 'acrtc' to silence this warning
10875 | struct amdgpu_crtc *acrtc;
| ^
| = NULL
1 warning generated.
vim +/acrtc +11143 drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c
11021
11022 if (new_crtc_state->active &&
11023 (!old_crtc_state->active ||
11024 drm_atomic_crtc_needs_modeset(new_crtc_state))) {
11025 dc_stream_retain(dm_new_crtc_state->stream);
11026 acrtc->dm_irq_params.stream = dm_new_crtc_state->stream;
11027 manage_dm_interrupts(adev, acrtc, dm_new_crtc_state);
11028 }
11029 /* Handle vrr on->off / off->on transitions */
11030 amdgpu_dm_handle_vrr_transition(dm_old_crtc_state, dm_new_crtc_state);
11031
11032 #ifdef CONFIG_DEBUG_FS
11033 if (new_crtc_state->active &&
11034 (!old_crtc_state->active ||
11035 drm_atomic_crtc_needs_modeset(new_crtc_state))) {
11036 /**
11037 * Frontend may have changed so reapply the CRC capture
11038 * settings for the stream.
11039 */
11040 if (amdgpu_dm_is_valid_crc_source(cur_crc_src)) {
11041 #if defined(CONFIG_DRM_AMD_SECURE_DISPLAY)
11042 if (amdgpu_dm_crc_window_is_activated(crtc)) {
11043 uint8_t cnt;
11044
11045 spin_lock_irqsave(&adev_to_drm(adev)->event_lock, flags);
11046 for (cnt = 0; cnt < MAX_CRC_WINDOW_NUM; cnt++) {
11047 if (acrtc->dm_irq_params.window_param[cnt].enable) {
11048 acrtc->dm_irq_params.window_param[cnt].update_win = true;
11049
11050 /**
11051 * It takes 2 frames for HW to stably generate CRC when
11052 * resuming from suspend, so we set skip_frame_cnt 2.
11053 */
11054 acrtc->dm_irq_params.window_param[cnt].skip_frame_cnt = 2;
11055 }
11056 }
11057 spin_unlock_irqrestore(&adev_to_drm(adev)->event_lock, flags);
11058 }
11059 #endif
11060 if (amdgpu_dm_crtc_configure_crc_source(
11061 crtc, dm_new_crtc_state, cur_crc_src))
11062 drm_dbg_atomic(dev, "Failed to configure crc source");
11063 }
11064 }
11065 #endif
11066 }
11067
11068 for_each_new_crtc_in_state(state, crtc, new_crtc_state, j)
11069 if (new_crtc_state->async_flip)
11070 wait_for_vblank = false;
11071
11072 /* update planes when needed per crtc*/
11073 for_each_new_crtc_in_state(state, crtc, new_crtc_state, j) {
11074 dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
11075
11076 if (dm_new_crtc_state->stream)
11077 amdgpu_dm_commit_planes(state, dev, dm, crtc, wait_for_vblank);
11078 }
11079
11080 /* Enable writeback */
11081 for_each_new_connector_in_state(state, connector, new_con_state, i) {
11082 struct dm_connector_state *dm_new_con_state = to_dm_connector_state(new_con_state);
11083 struct amdgpu_crtc *acrtc = to_amdgpu_crtc(dm_new_con_state->base.crtc);
11084
11085 if (connector->connector_type != DRM_MODE_CONNECTOR_WRITEBACK)
11086 continue;
11087
11088 if (!new_con_state->writeback_job)
11089 continue;
11090
11091 new_crtc_state = drm_atomic_get_new_crtc_state(state, &acrtc->base);
11092
11093 if (!new_crtc_state)
11094 continue;
11095
11096 if (acrtc->wb_enabled)
11097 continue;
11098
11099 dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
11100
11101 dm_set_writeback(dm, dm_new_crtc_state, connector, new_con_state);
11102 acrtc->wb_enabled = true;
11103 }
11104
11105 /* Update audio instances for each connector. */
11106 amdgpu_dm_commit_audio(dev, state);
11107
11108 /* restore the backlight level */
11109 for (i = 0; i < dm->num_of_edps; i++) {
11110 if (dm->backlight_dev[i] &&
11111 (dm->actual_brightness[i] != dm->brightness[i]))
11112 amdgpu_dm_backlight_set_level(dm, i, dm->brightness[i]);
11113 }
11114
11115 /*
11116 * send vblank event on all events not handled in flip and
11117 * mark consumed event for drm_atomic_helper_commit_hw_done
11118 */
11119 spin_lock_irqsave(&adev_to_drm(adev)->event_lock, flags);
11120 for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) {
11121
11122 if (new_crtc_state->event)
11123 drm_send_event_locked(dev, &new_crtc_state->event->base);
11124
11125 new_crtc_state->event = NULL;
11126 }
11127 spin_unlock_irqrestore(&adev_to_drm(adev)->event_lock, flags);
11128
11129 /* Signal HW programming completion */
11130 drm_atomic_helper_commit_hw_done(state);
11131
11132 if (wait_for_vblank &&
11133 drm_atomic_helper_wait_for_flip_done(dev, state)) {
11134 mutex_lock(&dm->dc_lock);
11135 if (dm_dmub_hw_init(adev))
11136 drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
11137 DRM_WEDGE_RECOVERY_BUS_RESET,
11138 NULL);
11139 mutex_unlock(&dm->dc_lock);
11140
11141 spin_lock_irqsave(&dev->event_lock, flags);
11142 drm_for_each_crtc(crtc, dev) {
11143 if (acrtc->event) {
11144 drm_crtc_send_vblank_event(crtc, acrtc->event);
11145 acrtc->event = NULL;
11146 drm_crtc_vblank_put(crtc);
11147 acrtc->pflip_status = AMDGPU_FLIP_NONE;
11148 }
11149 }
11150 spin_unlock_irqrestore(&dev->event_lock, flags);
11151 }
11152
11153 drm_atomic_helper_cleanup_planes(dev, state);
11154
11155 /* Don't free the memory if we are hitting this as part of suspend.
11156 * This way we don't free any memory during suspend; see
11157 * amdgpu_bo_free_kernel(). The memory will be freed in the first
11158 * non-suspend modeset or when the driver is torn down.
11159 */
11160 if (!adev->in_suspend) {
11161 /* return the stolen vga memory back to VRAM */
11162 if (!adev->mman.keep_stolen_vga_memory)
11163 amdgpu_bo_free_kernel(&adev->mman.stolen_vga_memory, NULL, NULL);
11164 amdgpu_bo_free_kernel(&adev->mman.stolen_extended_memory, NULL, NULL);
11165 }
11166
11167 /*
11168 * Finally, drop a runtime PM reference for each newly disabled CRTC,
11169 * so we can put the GPU into runtime suspend if we're not driving any
11170 * displays anymore
11171 */
11172 for (i = 0; i < crtc_disable_count; i++)
11173 pm_runtime_put_autosuspend(dev->dev);
11174 pm_runtime_mark_last_busy(dev->dev);
11175
11176 trace_amdgpu_dm_atomic_commit_tail_finish(state);
11177 }
11178
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v4 2/2] drm/amd/display: add vendor specific reset
2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
2026-02-20 17:37 ` Hamza Mahfooz
2026-02-22 1:32 ` kernel test robot
@ 2026-02-23 9:34 ` Christian König
2026-02-27 20:58 ` Hamza Mahfooz
2 siblings, 1 reply; 7+ messages in thread
From: Christian König @ 2026-02-23 9:34 UTC (permalink / raw)
To: Hamza Mahfooz, dri-devel
Cc: Michel Dänzer, Mario Limonciello, Harry Wentland, Leo Li,
Rodrigo Siqueira, Alex Deucher, David Airlie, Simona Vetter,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, Alex Hung,
Aurabindo Pillai, Wayne Lin, Timur Kristóf, Ivan Lipski,
Dominik Kaszewski, amd-gfx, linux-kernel
On 2/20/26 18:15, Hamza Mahfooz wrote:
> We now have a means to respond to page flip timeouts. So, hook up
> support by trying to reload DMUB firmware if
> drm_atomic_helper_wait_for_flip_done() fails. Also, send out a wedged
> event if the firmware reload fails.
>
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v2: send a wedged event instead of attempting a GPU reset.
> v3: read return value of drm_atomic_helper_wait_for_flip_done().
> v4: only send wedged event if firmware reload fails and send out "fake"
> page flip completes.
> ---
> .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 24 +++++++++++++++++--
> 1 file changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 7c51d8d7e73c..0ae6ada22fb0 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -87,6 +87,7 @@
> #include <drm/drm_atomic_uapi.h>
> #include <drm/drm_atomic_helper.h>
> #include <drm/drm_blend.h>
> +#include <drm/drm_drv.h>
> #include <drm/drm_fixed.h>
> #include <drm/drm_fourcc.h>
> #include <drm/drm_edid.h>
> @@ -10829,6 +10830,7 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
> struct drm_connector_state *old_con_state = NULL, *new_con_state = NULL;
> struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
> int crtc_disable_count = 0;
> + struct amdgpu_crtc *acrtc;
>
> trace_amdgpu_dm_atomic_commit_tail_begin(state);
>
> @@ -11085,8 +11087,26 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
> /* Signal HW programming completion */
> drm_atomic_helper_commit_hw_done(state);
>
> - if (wait_for_vblank)
> - drm_atomic_helper_wait_for_flip_done(dev, state);
> + if (wait_for_vblank &&
> + drm_atomic_helper_wait_for_flip_done(dev, state)) {
> + mutex_lock(&dm->dc_lock);
> + if (dm_dmub_hw_init(adev))
From Michel's explanation that is pretty much a no-go because it potentially causes other atomic commits to react in unforeseen ways.
> + drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
> + DRM_WEDGE_RECOVERY_BUS_RESET,
> + NULL);
Please completely drop that. This here is a sledge hammer and not going to fly anywhere.
> + mutex_unlock(&dm->dc_lock);
> +
> + spin_lock_irqsave(&dev->event_lock, flags);
> + drm_for_each_crtc(crtc, dev) {
This should go over only the CRTCs in the atomic commit currently handled.
Have you tried sending a hotplug event for the connectors in question as Michel suggested?
Regards,
Christian.
> + if (acrtc->event) {
> + drm_crtc_send_vblank_event(crtc, acrtc->event);
> + acrtc->event = NULL;
> + drm_crtc_vblank_put(crtc);
> + acrtc->pflip_status = AMDGPU_FLIP_NONE;
> + }
> + }
> + spin_unlock_irqrestore(&dev->event_lock, flags);
> + }
>
> drm_atomic_helper_cleanup_planes(dev, state);
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v4 2/2] drm/amd/display: add vendor specific reset
2026-02-23 9:34 ` Christian König
@ 2026-02-27 20:58 ` Hamza Mahfooz
0 siblings, 0 replies; 7+ messages in thread
From: Hamza Mahfooz @ 2026-02-27 20:58 UTC (permalink / raw)
To: Christian König
Cc: dri-devel, Michel Dänzer, Mario Limonciello, Harry Wentland,
Leo Li, Rodrigo Siqueira, Alex Deucher, David Airlie,
Simona Vetter, Maarten Lankhorst, Maxime Ripard,
Thomas Zimmermann, Alex Hung, Aurabindo Pillai, Wayne Lin,
Timur Kristóf, Ivan Lipski, Dominik Kaszewski, amd-gfx,
linux-kernel
On Mon, Feb 23, 2026 at 10:34:17AM +0100, Christian König wrote:
> > @@ -11085,8 +11087,26 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state)
> > /* Signal HW programming completion */
> > drm_atomic_helper_commit_hw_done(state);
> >
> > - if (wait_for_vblank)
> > - drm_atomic_helper_wait_for_flip_done(dev, state);
> > + if (wait_for_vblank &&
> > + drm_atomic_helper_wait_for_flip_done(dev, state)) {
> > + mutex_lock(&dm->dc_lock);
> > + if (dm_dmub_hw_init(adev))
>
> From Michel's explanation that is pretty much a no-go because it potentially causes other atomic commits to react in unforeseen ways.
>
This code would only run if the forced modeset fails, which is to say we
are already in a hung state, so I don't expect any other atomic commits
to be firing off. Also, evidently the timeout isn't a one off, so it's
almost certainly caused by hung firmware and not by a bug in the
driver's vblanking code.
> > + drm_dev_wedged_event(dev, DRM_WEDGE_RECOVERY_REBIND |
> > + DRM_WEDGE_RECOVERY_BUS_RESET,
> > + NULL);
>
> Please completely drop that. This here is a sledge hammer and not going to fly anywhere.
>
I don't feel too strongly about it, though isn't the point to inform
userspace that we were unable to recover? AFAIK the prescribed methods
are mere suggestions and users can choose to ignore them if they feel
that they are too hard hitting.
> > + mutex_unlock(&dm->dc_lock);
> > +
> > + spin_lock_irqsave(&dev->event_lock, flags);
> > + drm_for_each_crtc(crtc, dev) {
>
> This should go over only the CRTCs in the atomic commit currently handled.
>
> Have you tried sending a hotplug event for the connectors in question as Michel suggested?
>
Sure, I can look into that. However, we would still need the firmware
reload. Otherwise, we would just be forcing a modeset twice in
succession.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v4 1/2] drm: introduce KMS recovery mechanism
2026-02-20 17:15 [PATCH v4 1/2] drm: introduce KMS recovery mechanism Hamza Mahfooz
2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
@ 2026-02-28 18:48 ` Shengyu Qu
1 sibling, 0 replies; 7+ messages in thread
From: Shengyu Qu @ 2026-02-28 18:48 UTC (permalink / raw)
To: Hamza Mahfooz, dri-devel
Cc: wiagn233, Michel Dänzer, Mario Limonciello, Harry Wentland,
Leo Li, Rodrigo Siqueira, Alex Deucher, Christian König,
David Airlie, Simona Vetter, Maarten Lankhorst, Maxime Ripard,
Thomas Zimmermann, Alex Hung, Aurabindo Pillai, Wayne Lin,
Timur Kristóf, Ivan Lipski, Dominik Kaszewski, amd-gfx,
linux-kernel
[-- Attachment #1.1.1: Type: text/plain, Size: 6073 bytes --]
Hi,
Since you can reproduce this timeout issue, can you try the debug method
here[1] to confirm if your issue is the same as that patch?
[1] https://lists.freedesktop.org/archives/amd-gfx/2026-February/138842.html
Best regards,
Shengyu
在 2026/2/21 1:15, Hamza Mahfooz 写道:
> There should be a mechanism for drivers to respond to flip_done
> timeouts. Since, as it stands it is possible for the display to stall
> indefinitely, necessitating a hard reset. So, introduce a new mechanism
> that tries various methods of recovery with increasing aggression, in
> the following order:
>
> 1. Force a full modeset (have the compositor reprogram the state from
> scratch).
> 2. As a last resort, have the driver attempt a vendor specific reset
> (which they can do by reading the return value of
> drm_atomic_helper_wait_for_flip_done()).
>
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v2: new to the series
> v3: get rid of page_flip_timeout() and have
> drm_atomic_helper_wait_for_flip_done() return a error.
> v4: get rid of nested ret variable.
> ---
> drivers/gpu/drm/drm_atomic_helper.c | 47 ++++++++++++++++++++++++-----
> include/drm/drm_atomic_helper.h | 4 +--
> include/drm/drm_device.h | 24 +++++++++++++++
> 3 files changed, 66 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
> index 5840e9cc6f66..d905eb166225 100644
> --- a/drivers/gpu/drm/drm_atomic_helper.c
> +++ b/drivers/gpu/drm/drm_atomic_helper.c
> @@ -42,6 +42,7 @@
> #include <drm/drm_gem_atomic_helper.h>
> #include <drm/drm_panic.h>
> #include <drm/drm_print.h>
> +#include <drm/drm_probe_helper.h>
> #include <drm/drm_self_refresh_helper.h>
> #include <drm/drm_vblank.h>
> #include <drm/drm_writeback.h>
> @@ -1864,30 +1865,62 @@ EXPORT_SYMBOL(drm_atomic_helper_wait_for_vblanks);
> *
> * This requires that drivers use the nonblocking commit tracking support
> * initialized using drm_atomic_helper_setup_commit().
> + *
> + * Returns:
> + * -ETIMEDOUT to indicate that drivers can attempt a vendor reset, 0 otherwise.
> */
> -void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> - struct drm_atomic_state *state)
> +int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> + struct drm_atomic_state *state)
> {
> struct drm_crtc *crtc;
> + int ret = 0;
> int i;
>
> for (i = 0; i < dev->mode_config.num_crtc; i++) {
> struct drm_crtc_commit *commit = state->crtcs[i].commit;
> - int ret;
>
> crtc = state->crtcs[i].ptr;
>
> if (!crtc || !commit)
> continue;
>
> - ret = wait_for_completion_timeout(&commit->flip_done, 10 * HZ);
> - if (ret == 0)
> - drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
> - crtc->base.id, crtc->name);
> + if (!wait_for_completion_timeout(&commit->flip_done, 10 * HZ)) {
> + switch (dev->reset_phase) {
> + case DRM_KMS_RESET_NONE:
> + drm_err(dev, "[CRTC:%d:%s] flip_done timed out\n",
> + crtc->base.id, crtc->name);
> + dev->reset_phase = DRM_KMS_RESET_FORCE_MODESET;
> + drm_kms_helper_hotplug_event(dev);
> + break;
> + case DRM_KMS_RESET_FORCE_MODESET:
> + drm_err(dev, "[CRTC:%d:%s] force full modeset failed\n",
> + crtc->base.id, crtc->name);
> + dev->reset_phase = DRM_KMS_RESET_VENDOR;
> + ret = -ETIMEDOUT;
> + break;
> + case DRM_KMS_RESET_VENDOR:
> + drm_err(dev, "[CRTC:%d:%s] KMS recovery failed!\n",
> + crtc->base.id, crtc->name);
> + dev->reset_phase = DRM_KMS_RESET_GIVE_UP;
> + break;
> + default:
> + break;
> + }
> +
> + goto exit;
> + }
> + }
> +
> + if (dev->reset_phase) {
> + drm_info(dev, "KMS recovery succeeded!\n");
> + dev->reset_phase = DRM_KMS_RESET_NONE;
> }
>
> +exit:
> if (state->fake_commit)
> complete_all(&state->fake_commit->flip_done);
> +
> + return ret;
> }
> EXPORT_SYMBOL(drm_atomic_helper_wait_for_flip_done);
>
> diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h
> index 53382fe93537..298c8dff3993 100644
> --- a/include/drm/drm_atomic_helper.h
> +++ b/include/drm/drm_atomic_helper.h
> @@ -79,8 +79,8 @@ int drm_atomic_helper_wait_for_fences(struct drm_device *dev,
> void drm_atomic_helper_wait_for_vblanks(struct drm_device *dev,
> struct drm_atomic_state *old_state);
>
> -void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> - struct drm_atomic_state *old_state);
> +int drm_atomic_helper_wait_for_flip_done(struct drm_device *dev,
> + struct drm_atomic_state *old_state);
>
> void
> drm_atomic_helper_update_legacy_modeset_state(struct drm_device *dev,
> diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
> index bc78fb77cc27..1244d7527e7b 100644
> --- a/include/drm/drm_device.h
> +++ b/include/drm/drm_device.h
> @@ -66,6 +66,23 @@ enum switch_power_state {
> DRM_SWITCH_POWER_DYNAMIC_OFF = 3,
> };
>
> +/**
> + * enum drm_kms_reset_phase - reset phase of drm device
> + */
> +enum drm_kms_reset_phase {
> + /** @DRM_KMS_RESET_NONE: Not currently attempting recovery */
> + DRM_KMS_RESET_NONE,
> +
> + /** @DRM_KMS_RESET_FORCE_MODESET: Force a full modeset */
> + DRM_KMS_RESET_FORCE_MODESET,
> +
> + /** @DRM_KMS_RESET_VENDOR: Attempt a vendor reset */
> + DRM_KMS_RESET_VENDOR,
> +
> + /** @DRM_KMS_RESET_GIVE_UP: All recovery methods failed */
> + DRM_KMS_RESET_GIVE_UP,
> +};
> +
> /**
> * struct drm_device - DRM device structure
> *
> @@ -375,6 +392,13 @@ struct drm_device {
> * Root directory for debugfs files.
> */
> struct dentry *debugfs_root;
> +
> + /**
> + * @reset_phase:
> + *
> + * Reset phase that the device is in.
> + */
> + enum drm_kms_reset_phase reset_phase;
> };
>
> void drm_dev_set_dma_dev(struct drm_device *dev, struct device *dma_dev);
[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 6977 bytes --]
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 840 bytes --]
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2026-02-28 18:48 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-02-20 17:15 [PATCH v4 1/2] drm: introduce KMS recovery mechanism Hamza Mahfooz
2026-02-20 17:15 ` [PATCH v4 2/2] drm/amd/display: add vendor specific reset Hamza Mahfooz
2026-02-20 17:37 ` Hamza Mahfooz
2026-02-22 1:32 ` kernel test robot
2026-02-23 9:34 ` Christian König
2026-02-27 20:58 ` Hamza Mahfooz
2026-02-28 18:48 ` [PATCH v4 1/2] drm: introduce KMS recovery mechanism Shengyu Qu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox