All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] accel/ivpu: Add inference_timeout_ms module parameter
@ 2025-05-15  9:31 Jacek Lawrynowicz
  2025-05-16 14:39 ` Jeff Hugo
  2025-06-02 12:42 ` Jacek Lawrynowicz
  0 siblings, 2 replies; 3+ messages in thread
From: Jacek Lawrynowicz @ 2025-05-15  9:31 UTC (permalink / raw)
  To: dri-devel; +Cc: jeff.hugo, lizhi.hou, Karol Wachowski, Jacek Lawrynowicz

From: Karol Wachowski <karol.wachowski@intel.com>

Add a new inference_timeout_ms module parameter that specifies the
maximum duration, in milliseconds, that an inference may run before
a recovery is triggered.

Calculate the maximum number of heartbeat retries as the ratio of the
inference timeout to the TDR timeout, rounded up.

Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
---
 drivers/accel/ivpu/ivpu_drv.h |  1 +
 drivers/accel/ivpu/ivpu_hw.c  |  4 ++++
 drivers/accel/ivpu/ivpu_pm.c  | 15 ++++++++++++---
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h
index 5497e7030e915..b6d6b3238b596 100644
--- a/drivers/accel/ivpu/ivpu_drv.h
+++ b/drivers/accel/ivpu/ivpu_drv.h
@@ -165,6 +165,7 @@ struct ivpu_device {
 		int boot;
 		int jsm;
 		int tdr;
+		int inference;
 		int autosuspend;
 		int d0i3_entry_msg;
 		int state_dump_msg;
diff --git a/drivers/accel/ivpu/ivpu_hw.c b/drivers/accel/ivpu/ivpu_hw.c
index 633160470c939..08dcc31b56f4d 100644
--- a/drivers/accel/ivpu/ivpu_hw.c
+++ b/drivers/accel/ivpu/ivpu_hw.c
@@ -94,12 +94,14 @@ static void timeouts_init(struct ivpu_device *vdev)
 		vdev->timeout.boot = -1;
 		vdev->timeout.jsm = -1;
 		vdev->timeout.tdr = -1;
+		vdev->timeout.inference = -1;
 		vdev->timeout.autosuspend = -1;
 		vdev->timeout.d0i3_entry_msg = -1;
 	} else if (ivpu_is_fpga(vdev)) {
 		vdev->timeout.boot = 50;
 		vdev->timeout.jsm = 15000;
 		vdev->timeout.tdr = 30000;
+		vdev->timeout.inference = 900000;
 		vdev->timeout.autosuspend = -1;
 		vdev->timeout.d0i3_entry_msg = 500;
 		vdev->timeout.state_dump_msg = 10000;
@@ -107,6 +109,7 @@ static void timeouts_init(struct ivpu_device *vdev)
 		vdev->timeout.boot = 50;
 		vdev->timeout.jsm = 500;
 		vdev->timeout.tdr = 10000;
+		vdev->timeout.inference = 300000;
 		vdev->timeout.autosuspend = 100;
 		vdev->timeout.d0i3_entry_msg = 100;
 		vdev->timeout.state_dump_msg = 10;
@@ -114,6 +117,7 @@ static void timeouts_init(struct ivpu_device *vdev)
 		vdev->timeout.boot = 1000;
 		vdev->timeout.jsm = 500;
 		vdev->timeout.tdr = 2000;
+		vdev->timeout.inference = 60000;
 		if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX)
 			vdev->timeout.autosuspend = 10;
 		else
diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
index ea30db181cd75..eacda1dbe8405 100644
--- a/drivers/accel/ivpu/ivpu_pm.c
+++ b/drivers/accel/ivpu/ivpu_pm.c
@@ -33,8 +33,11 @@ static unsigned long ivpu_tdr_timeout_ms;
 module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
 MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
 
+static unsigned long ivpu_inference_timeout_ms;
+module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
+MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");
+
 #define PM_RESCHEDULE_LIMIT     5
-#define PM_TDR_HEARTBEAT_LIMIT  30
 
 static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
 {
@@ -191,6 +194,10 @@ static void ivpu_job_timeout_work(struct work_struct *work)
 {
 	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
 	struct ivpu_device *vdev = pm->vdev;
+	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
+	unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
+					     vdev->timeout.inference;
+	u64 inference_max_retries;
 	u64 heartbeat;
 
 	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
@@ -198,8 +205,10 @@ static void ivpu_job_timeout_work(struct work_struct *work)
 		goto recovery;
 	}
 
-	if (atomic_fetch_inc(&vdev->job_timeout_counter) > PM_TDR_HEARTBEAT_LIMIT) {
-		ivpu_err(vdev, "Job timeout detected, heartbeat limit exceeded\n");
+	inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
+	if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
+		ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
+			 inference_max_retries);
 		goto recovery;
 	}
 
-- 
2.45.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] accel/ivpu: Add inference_timeout_ms module parameter
  2025-05-15  9:31 [PATCH] accel/ivpu: Add inference_timeout_ms module parameter Jacek Lawrynowicz
@ 2025-05-16 14:39 ` Jeff Hugo
  2025-06-02 12:42 ` Jacek Lawrynowicz
  1 sibling, 0 replies; 3+ messages in thread
From: Jeff Hugo @ 2025-05-16 14:39 UTC (permalink / raw)
  To: Jacek Lawrynowicz, dri-devel; +Cc: lizhi.hou, Karol Wachowski

On 5/15/2025 3:31 AM, Jacek Lawrynowicz wrote:
> From: Karol Wachowski <karol.wachowski@intel.com>
> 
> Add new inference_timeout_ms parameter that allows specifying
> maximum allowed duration in milliseconds that inference can take before
> triggering a recovery.
> 
> Calculate maximum number of heartbeat retries based on ratio between
> inference timeout and tdr timeout.
> 
> Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
> Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>

Reviewed-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] accel/ivpu: Add inference_timeout_ms module parameter
  2025-05-15  9:31 [PATCH] accel/ivpu: Add inference_timeout_ms module parameter Jacek Lawrynowicz
  2025-05-16 14:39 ` Jeff Hugo
@ 2025-06-02 12:42 ` Jacek Lawrynowicz
  1 sibling, 0 replies; 3+ messages in thread
From: Jacek Lawrynowicz @ 2025-06-02 12:42 UTC (permalink / raw)
  To: dri-devel; +Cc: jeff.hugo, lizhi.hou, Karol Wachowski

Applied to drm-misc-next

On 5/15/2025 11:31 AM, Jacek Lawrynowicz wrote:
> From: Karol Wachowski <karol.wachowski@intel.com>
> 
> Add new inference_timeout_ms parameter that allows specifying
> maximum allowed duration in milliseconds that inference can take before
> triggering a recovery.
> 
> Calculate maximum number of heartbeat retries based on ratio between
> inference timeout and tdr timeout.
> 
> Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
> Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
> ---
>  drivers/accel/ivpu/ivpu_drv.h |  1 +
>  drivers/accel/ivpu/ivpu_hw.c  |  4 ++++
>  drivers/accel/ivpu/ivpu_pm.c  | 15 ++++++++++++---
>  3 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h
> index 5497e7030e915..b6d6b3238b596 100644
> --- a/drivers/accel/ivpu/ivpu_drv.h
> +++ b/drivers/accel/ivpu/ivpu_drv.h
> @@ -165,6 +165,7 @@ struct ivpu_device {
>  		int boot;
>  		int jsm;
>  		int tdr;
> +		int inference;
>  		int autosuspend;
>  		int d0i3_entry_msg;
>  		int state_dump_msg;
> diff --git a/drivers/accel/ivpu/ivpu_hw.c b/drivers/accel/ivpu/ivpu_hw.c
> index 633160470c939..08dcc31b56f4d 100644
> --- a/drivers/accel/ivpu/ivpu_hw.c
> +++ b/drivers/accel/ivpu/ivpu_hw.c
> @@ -94,12 +94,14 @@ static void timeouts_init(struct ivpu_device *vdev)
>  		vdev->timeout.boot = -1;
>  		vdev->timeout.jsm = -1;
>  		vdev->timeout.tdr = -1;
> +		vdev->timeout.inference = -1;
>  		vdev->timeout.autosuspend = -1;
>  		vdev->timeout.d0i3_entry_msg = -1;
>  	} else if (ivpu_is_fpga(vdev)) {
>  		vdev->timeout.boot = 50;
>  		vdev->timeout.jsm = 15000;
>  		vdev->timeout.tdr = 30000;
> +		vdev->timeout.inference = 900000;
>  		vdev->timeout.autosuspend = -1;
>  		vdev->timeout.d0i3_entry_msg = 500;
>  		vdev->timeout.state_dump_msg = 10000;
> @@ -107,6 +109,7 @@ static void timeouts_init(struct ivpu_device *vdev)
>  		vdev->timeout.boot = 50;
>  		vdev->timeout.jsm = 500;
>  		vdev->timeout.tdr = 10000;
> +		vdev->timeout.inference = 300000;
>  		vdev->timeout.autosuspend = 100;
>  		vdev->timeout.d0i3_entry_msg = 100;
>  		vdev->timeout.state_dump_msg = 10;
> @@ -114,6 +117,7 @@ static void timeouts_init(struct ivpu_device *vdev)
>  		vdev->timeout.boot = 1000;
>  		vdev->timeout.jsm = 500;
>  		vdev->timeout.tdr = 2000;
> +		vdev->timeout.inference = 60000;
>  		if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX)
>  			vdev->timeout.autosuspend = 10;
>  		else
> diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
> index ea30db181cd75..eacda1dbe8405 100644
> --- a/drivers/accel/ivpu/ivpu_pm.c
> +++ b/drivers/accel/ivpu/ivpu_pm.c
> @@ -33,8 +33,11 @@ static unsigned long ivpu_tdr_timeout_ms;
>  module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
>  MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
>  
> +static unsigned long ivpu_inference_timeout_ms;
> +module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
> +MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");
> +
>  #define PM_RESCHEDULE_LIMIT     5
> -#define PM_TDR_HEARTBEAT_LIMIT  30
>  
>  static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
>  {
> @@ -191,6 +194,10 @@ static void ivpu_job_timeout_work(struct work_struct *work)
>  {
>  	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
>  	struct ivpu_device *vdev = pm->vdev;
> +	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
> +	unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
> +					     vdev->timeout.inference;
> +	u64 inference_max_retries;
>  	u64 heartbeat;
>  
>  	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
> @@ -198,8 +205,10 @@ static void ivpu_job_timeout_work(struct work_struct *work)
>  		goto recovery;
>  	}
>  
> -	if (atomic_fetch_inc(&vdev->job_timeout_counter) > PM_TDR_HEARTBEAT_LIMIT) {
> -		ivpu_err(vdev, "Job timeout detected, heartbeat limit exceeded\n");
> +	inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
> +	if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
> +		ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
> +			 inference_max_retries);
>  		goto recovery;
>  	}
>  


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-06-02 12:42 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-15  9:31 [PATCH] accel/ivpu: Add inference_timeout_ms module parameter Jacek Lawrynowicz
2025-05-16 14:39 ` Jeff Hugo
2025-06-02 12:42 ` Jacek Lawrynowicz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.