[PATCH] accel/ivpu: Implement heartbeat-based TDR mechanism

dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] accel/ivpu: Implement heartbeat-based TDR mechanism
@ 2025-04-16 10:25 Maciej Falkowski
  2025-04-18 15:27 ` Jeffrey Hugo
  2025-04-25  8:50 ` Jacek Lawrynowicz
  0 siblings, 2 replies; 6+ messages in thread
From: Maciej Falkowski @ 2025-04-16 10:25 UTC (permalink / raw)
  To: dri-devel
  Cc: oded.gabbay, quic_jhugo, jacek.lawrynowicz, lizhi.hou,
	Karol Wachowski, Maciej Falkowski

From: Karol Wachowski <karol.wachowski@intel.com>

Introduce a heartbeat-based Timeout Detection and Recovery (TDR) mechanism.
The enhancement aims to improve the reliability of device hang detection by
monitoring heartbeat updates.

Each progressing inference will update heartbeat counter allowing driver to
monitor its progression. Limit maximum number of reschedules when heartbeat
indicates progression to 30.

The heartbeat mechanism provides a more robust method for detecting device
hangs, potentially reducing false positive recoveries due to long running
inferences.

Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
Signed-off-by: Maciej Falkowski <maciej.falkowski@linux.intel.com>
---
 drivers/accel/ivpu/ivpu_drv.c |  4 ++++
 drivers/accel/ivpu/ivpu_drv.h |  1 +
 drivers/accel/ivpu/ivpu_fw.h  |  1 +
 drivers/accel/ivpu/ivpu_pm.c  | 20 ++++++++++++++++++++
 4 files changed, 26 insertions(+)

diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c
index eff1d3ca075f..0e7748c5e117 100644
--- a/drivers/accel/ivpu/ivpu_drv.c
+++ b/drivers/accel/ivpu/ivpu_drv.c
@@ -374,6 +374,9 @@ int ivpu_boot(struct ivpu_device *vdev)
 {
 	int ret;
 
+	drm_WARN_ON(&vdev->drm, atomic_read(&vdev->job_timeout_counter));
+	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
+
 	/* Update boot params located at first 4KB of FW memory */
 	ivpu_fw_boot_params_setup(vdev, ivpu_bo_vaddr(vdev->fw->mem));
 
@@ -573,6 +576,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev)
 	vdev->context_xa_limit.min = IVPU_USER_CONTEXT_MIN_SSID;
 	vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID;
 	atomic64_set(&vdev->unique_id_counter, 0);
+	atomic_set(&vdev->job_timeout_counter, 0);
 	xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 	xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
 	xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h
index 92753effb1c9..5497e7030e91 100644
--- a/drivers/accel/ivpu/ivpu_drv.h
+++ b/drivers/accel/ivpu/ivpu_drv.h
@@ -154,6 +154,7 @@ struct ivpu_device {
 	struct mutex submitted_jobs_lock; /* Protects submitted_jobs */
 	struct xarray submitted_jobs_xa;
 	struct ivpu_ipc_consumer job_done_consumer;
+	atomic_t job_timeout_counter;
 
 	atomic64_t unique_id_counter;
 
diff --git a/drivers/accel/ivpu/ivpu_fw.h b/drivers/accel/ivpu/ivpu_fw.h
index 1d0b2bd9d65c..9a3935be1c05 100644
--- a/drivers/accel/ivpu/ivpu_fw.h
+++ b/drivers/accel/ivpu/ivpu_fw.h
@@ -39,6 +39,7 @@ struct ivpu_fw_info {
 	u64 read_only_addr;
 	u32 read_only_size;
 	u32 sched_mode;
+	u64 last_heartbeat;
 };
 
 int ivpu_fw_init(struct ivpu_device *vdev);
diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
index b5891e91f7ab..1fe03fc16bbc 100644
--- a/drivers/accel/ivpu/ivpu_pm.c
+++ b/drivers/accel/ivpu/ivpu_pm.c
@@ -34,6 +34,7 @@ module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
 MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
 
 #define PM_RESCHEDULE_LIMIT     5
+#define PM_TDR_HEARTBEAT_LIMIT  30
 
 static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
 {
@@ -44,6 +45,7 @@ static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
 	ivpu_fw_log_reset(vdev);
 	ivpu_fw_load(vdev);
 	fw->entry_point = fw->cold_boot_entry_point;
+	fw->last_heartbeat = 0;
 }
 
 static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
@@ -189,7 +191,24 @@ static void ivpu_job_timeout_work(struct work_struct *work)
 {
 	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
 	struct ivpu_device *vdev = pm->vdev;
+	u64 heartbeat;
 
+	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
+		ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
+		goto recovery;
+	}
+
+	if (atomic_fetch_inc(&vdev->job_timeout_counter) > PM_TDR_HEARTBEAT_LIMIT) {
+		ivpu_err(vdev, "Job timeout detected, heartbeat limit exceeded\n");
+		goto recovery;
+	}
+
+	vdev->fw->last_heartbeat = heartbeat;
+	ivpu_start_job_timeout_detection(vdev);
+	return;
+
+recovery:
+	atomic_set(&vdev->job_timeout_counter, 0);
 	ivpu_pm_trigger_recovery(vdev, "TDR");
 }
 
@@ -204,6 +223,7 @@ void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
 void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
 {
 	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
+	atomic_set(&vdev->job_timeout_counter, 0);
 }
 
 int ivpu_pm_suspend_cb(struct device *dev)
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] accel/ivpu: Implement heartbeat-based TDR mechanism
  2025-04-16 10:25 [PATCH] accel/ivpu: Implement heartbeat-based TDR mechanism Maciej Falkowski
@ 2025-04-18 15:27 ` Jeffrey Hugo
  2025-04-23  7:23   ` Jacek Lawrynowicz
  2025-04-25  8:50 ` Jacek Lawrynowicz
  1 sibling, 1 reply; 6+ messages in thread
From: Jeffrey Hugo @ 2025-04-18 15:27 UTC (permalink / raw)
  To: Maciej Falkowski, dri-devel
  Cc: oded.gabbay, jacek.lawrynowicz, lizhi.hou, Karol Wachowski

On 4/16/2025 4:25 AM, Maciej Falkowski wrote:
> From: Karol Wachowski <karol.wachowski@intel.com>
> 
> Introduce a heartbeat-based Timeout Detection and Recovery (TDR) mechanism.
> The enhancement aims to improve the reliability of device hang detection by
> monitoring heartbeat updates.
> 
> Each progressing inference will update heartbeat counter allowing driver to
> monitor its progression. Limit maximum number of reschedules when heartbeat
> indicates progression to 30.

Code looks good.  However, why 30?  This would artificially limit how 
long a job could run, no?

-Jeff

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] accel/ivpu: Implement heartbeat-based TDR mechanism
  2025-04-18 15:27 ` Jeffrey Hugo
@ 2025-04-23  7:23   ` Jacek Lawrynowicz
  2025-04-23 14:23     ` Jeff Hugo
  0 siblings, 1 reply; 6+ messages in thread
From: Jacek Lawrynowicz @ 2025-04-23  7:23 UTC (permalink / raw)
  To: Jeffrey Hugo, Maciej Falkowski, dri-devel
  Cc: oded.gabbay, lizhi.hou, Karol Wachowski

Hi,

On 4/18/2025 5:27 PM, Jeffrey Hugo wrote:
> On 4/16/2025 4:25 AM, Maciej Falkowski wrote:
>> From: Karol Wachowski <karol.wachowski@intel.com>
>>
>> Introduce a heartbeat-based Timeout Detection and Recovery (TDR) mechanism.
>> The enhancement aims to improve the reliability of device hang detection by
>> monitoring heartbeat updates.
>>
>> Each progressing inference will update heartbeat counter allowing driver to
>> monitor its progression. Limit maximum number of reschedules when heartbeat
>> indicates progression to 30.
> 
> Code looks good.  However, why 30?  This would artificially limit how long a job could run, no?

Yes, we still need a time based limit. There may be workloads that are stuck in infinite loop for example.
With this patch the max time the job can run is extended from 2 to 60 seconds.
We are not aware of any workloads that exceed this timeout at the moment.

Regards,
Jacek

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] accel/ivpu: Implement heartbeat-based TDR mechanism
  2025-04-23  7:23   ` Jacek Lawrynowicz
@ 2025-04-23 14:23     ` Jeff Hugo
  2025-04-25  8:22       ` Jacek Lawrynowicz
  0 siblings, 1 reply; 6+ messages in thread
From: Jeff Hugo @ 2025-04-23 14:23 UTC (permalink / raw)
  To: Jacek Lawrynowicz, Maciej Falkowski, dri-devel
  Cc: oded.gabbay, lizhi.hou, Karol Wachowski

On 4/23/2025 1:23 AM, Jacek Lawrynowicz wrote:
> Hi,
> 
> On 4/18/2025 5:27 PM, Jeffrey Hugo wrote:
>> On 4/16/2025 4:25 AM, Maciej Falkowski wrote:
>>> From: Karol Wachowski <karol.wachowski@intel.com>
>>>
>>> Introduce a heartbeat-based Timeout Detection and Recovery (TDR) mechanism.
>>> The enhancement aims to improve the reliability of device hang detection by
>>> monitoring heartbeat updates.
>>>
>>> Each progressing inference will update heartbeat counter allowing driver to
>>> monitor its progression. Limit maximum number of reschedules when heartbeat
>>> indicates progression to 30.
>>
>> Code looks good.  However, why 30?  This would artificially limit how long a job could run, no?
> 
> Yes, we still need a time based limit. There may be workloads that are stuck in infinite loop for example.
> With this patch the max time the job can run is extended from 2 to 60 seconds.
> We are not aware of any workloads that exceed this timeout at the moment.

Infinite loop vs something that just happens to be running long by 
design is a difficult problem.  60 seconds does not seem all that long 
to me.  Perhaps consider some kind of override so that if/when a 
workload comes along that needs more than 60 seconds, the user doesn't 
need to recompile their kernel to make it work?  I suspect that would be 
outside the scope of this change.

For this change, I think it would be good to add some info from your 
response.  I think the commit text would be improved by stating this 
increases the max runtime from 2 seconds to 60, and that this covers all 
known workloads.  Also, I think a comment on PM_TDR_HEARTBEAT_LIMIT that 
tells how long the limit is (60 seconds) would be helpful to future 
readers, instead of needing to parse through multiple functions and how 
they all interact.

With the commit text update -

Reviewed-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>

The PM_TDR_HEARTBEAT_LIMIT comment is optional to me.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] accel/ivpu: Implement heartbeat-based TDR mechanism
  2025-04-23 14:23     ` Jeff Hugo
@ 2025-04-25  8:22       ` Jacek Lawrynowicz
  0 siblings, 0 replies; 6+ messages in thread
From: Jacek Lawrynowicz @ 2025-04-25  8:22 UTC (permalink / raw)
  To: Jeff Hugo, Maciej Falkowski, dri-devel
  Cc: oded.gabbay, lizhi.hou, Karol Wachowski

Hi,

On 4/23/2025 4:23 PM, Jeff Hugo wrote:
> On 4/23/2025 1:23 AM, Jacek Lawrynowicz wrote:
>> Hi,
>>
>> On 4/18/2025 5:27 PM, Jeffrey Hugo wrote:
>>> On 4/16/2025 4:25 AM, Maciej Falkowski wrote:
>>>> From: Karol Wachowski <karol.wachowski@intel.com>
>>>>
>>>> Introduce a heartbeat-based Timeout Detection and Recovery (TDR) mechanism.
>>>> The enhancement aims to improve the reliability of device hang detection by
>>>> monitoring heartbeat updates.
>>>>
>>>> Each progressing inference will update heartbeat counter allowing driver to
>>>> monitor its progression. Limit maximum number of reschedules when heartbeat
>>>> indicates progression to 30.
>>>
>>> Code looks good.  However, why 30?  This would artificially limit how long a job could run, no?
>>
>> Yes, we still need a time based limit. There may be workloads that are stuck in infinite loop for example.
>> With this patch the max time the job can run is extended from 2 to 60 seconds.
>> We are not aware of any workloads that exceed this timeout at the moment.
> 
> Infinite loop vs something that just happens to be running long by design is a difficult problem.  60 seconds does not seem all that long to me.  Perhaps consider some kind of override so that if/when a workload comes along that needs more than 60 seconds, the user doesn't need to recompile their kernel to make it work?  I suspect that would be outside the scope of this change.
> 
> For this change, I think it would be good to add some info from your response.  I think the commit text would be improved by stating this increases the max runtime from 2 seconds to 60, and that this covers all known workloads.  Also, I think a comment on PM_TDR_HEARTBEAT_LIMIT that tells how long the limit is (60 seconds) would be helpful to future readers, instead of needing to parse through multiple functions and how they all interact.
> 
> With the commit text update -
> 
> Reviewed-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>
> 
> The PM_TDR_HEARTBEAT_LIMIT comment is optional to me.

OK, I will update the commit message.

There is a module param that can be used to increase the overall timeout (tdr_timeout_ms).
The total timeout is (tdr_timeout_ms * 30) but this is not too intuitive after this change.
I will figure better solution that's easier to use.


Regards,
Jacek


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] accel/ivpu: Implement heartbeat-based TDR mechanism
  2025-04-16 10:25 [PATCH] accel/ivpu: Implement heartbeat-based TDR mechanism Maciej Falkowski
  2025-04-18 15:27 ` Jeffrey Hugo
@ 2025-04-25  8:50 ` Jacek Lawrynowicz
  1 sibling, 0 replies; 6+ messages in thread
From: Jacek Lawrynowicz @ 2025-04-25  8:50 UTC (permalink / raw)
  To: Maciej Falkowski, dri-devel
  Cc: oded.gabbay, quic_jhugo, lizhi.hou, Karol Wachowski

Applied to drm-misc-next

On 4/16/2025 12:25 PM, Maciej Falkowski wrote:
> From: Karol Wachowski <karol.wachowski@intel.com>
> 
> Introduce a heartbeat-based Timeout Detection and Recovery (TDR) mechanism.
> The enhancement aims to improve the reliability of device hang detection by
> monitoring heartbeat updates.
> 
> Each progressing inference will update heartbeat counter allowing driver to
> monitor its progression. Limit maximum number of reschedules when heartbeat
> indicates progression to 30.
> 
> The heartbeat mechanism provides a more robust method for detecting device
> hangs, potentially reducing false positive recoveries due to long running
> inferences.
> 
> Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
> Signed-off-by: Maciej Falkowski <maciej.falkowski@linux.intel.com>
> ---
>  drivers/accel/ivpu/ivpu_drv.c |  4 ++++
>  drivers/accel/ivpu/ivpu_drv.h |  1 +
>  drivers/accel/ivpu/ivpu_fw.h  |  1 +
>  drivers/accel/ivpu/ivpu_pm.c  | 20 ++++++++++++++++++++
>  4 files changed, 26 insertions(+)
> 
> diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c
> index eff1d3ca075f..0e7748c5e117 100644
> --- a/drivers/accel/ivpu/ivpu_drv.c
> +++ b/drivers/accel/ivpu/ivpu_drv.c
> @@ -374,6 +374,9 @@ int ivpu_boot(struct ivpu_device *vdev)
>  {
>  	int ret;
>  
> +	drm_WARN_ON(&vdev->drm, atomic_read(&vdev->job_timeout_counter));
> +	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
> +
>  	/* Update boot params located at first 4KB of FW memory */
>  	ivpu_fw_boot_params_setup(vdev, ivpu_bo_vaddr(vdev->fw->mem));
>  
> @@ -573,6 +576,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev)
>  	vdev->context_xa_limit.min = IVPU_USER_CONTEXT_MIN_SSID;
>  	vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID;
>  	atomic64_set(&vdev->unique_id_counter, 0);
> +	atomic_set(&vdev->job_timeout_counter, 0);
>  	xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
>  	xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
>  	xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
> diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h
> index 92753effb1c9..5497e7030e91 100644
> --- a/drivers/accel/ivpu/ivpu_drv.h
> +++ b/drivers/accel/ivpu/ivpu_drv.h
> @@ -154,6 +154,7 @@ struct ivpu_device {
>  	struct mutex submitted_jobs_lock; /* Protects submitted_jobs */
>  	struct xarray submitted_jobs_xa;
>  	struct ivpu_ipc_consumer job_done_consumer;
> +	atomic_t job_timeout_counter;
>  
>  	atomic64_t unique_id_counter;
>  
> diff --git a/drivers/accel/ivpu/ivpu_fw.h b/drivers/accel/ivpu/ivpu_fw.h
> index 1d0b2bd9d65c..9a3935be1c05 100644
> --- a/drivers/accel/ivpu/ivpu_fw.h
> +++ b/drivers/accel/ivpu/ivpu_fw.h
> @@ -39,6 +39,7 @@ struct ivpu_fw_info {
>  	u64 read_only_addr;
>  	u32 read_only_size;
>  	u32 sched_mode;
> +	u64 last_heartbeat;
>  };
>  
>  int ivpu_fw_init(struct ivpu_device *vdev);
> diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
> index b5891e91f7ab..1fe03fc16bbc 100644
> --- a/drivers/accel/ivpu/ivpu_pm.c
> +++ b/drivers/accel/ivpu/ivpu_pm.c
> @@ -34,6 +34,7 @@ module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
>  MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
>  
>  #define PM_RESCHEDULE_LIMIT     5
> +#define PM_TDR_HEARTBEAT_LIMIT  30
>  
>  static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
>  {
> @@ -44,6 +45,7 @@ static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
>  	ivpu_fw_log_reset(vdev);
>  	ivpu_fw_load(vdev);
>  	fw->entry_point = fw->cold_boot_entry_point;
> +	fw->last_heartbeat = 0;
>  }
>  
>  static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
> @@ -189,7 +191,24 @@ static void ivpu_job_timeout_work(struct work_struct *work)
>  {
>  	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
>  	struct ivpu_device *vdev = pm->vdev;
> +	u64 heartbeat;
>  
> +	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
> +		ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
> +		goto recovery;
> +	}
> +
> +	if (atomic_fetch_inc(&vdev->job_timeout_counter) > PM_TDR_HEARTBEAT_LIMIT) {
> +		ivpu_err(vdev, "Job timeout detected, heartbeat limit exceeded\n");
> +		goto recovery;
> +	}
> +
> +	vdev->fw->last_heartbeat = heartbeat;
> +	ivpu_start_job_timeout_detection(vdev);
> +	return;
> +
> +recovery:
> +	atomic_set(&vdev->job_timeout_counter, 0);
>  	ivpu_pm_trigger_recovery(vdev, "TDR");
>  }
>  
> @@ -204,6 +223,7 @@ void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
>  void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
>  {
>  	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
> +	atomic_set(&vdev->job_timeout_counter, 0);
>  }
>  
>  int ivpu_pm_suspend_cb(struct device *dev)


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-04-25  8:50 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-16 10:25 [PATCH] accel/ivpu: Implement heartbeat-based TDR mechanism Maciej Falkowski
2025-04-18 15:27 ` Jeffrey Hugo
2025-04-23  7:23   ` Jacek Lawrynowicz
2025-04-23 14:23     ` Jeff Hugo
2025-04-25  8:22       ` Jacek Lawrynowicz
2025-04-25  8:50 ` Jacek Lawrynowicz

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).