All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
To: dri-devel@lists.freedesktop.org
Cc: jeff.hugo@oss.qualcomm.com, lizhi.hou@amd.com,
	Karol Wachowski <karol.wachowski@intel.com>
Subject: Re: [PATCH] accel/ivpu: Add inference_timeout_ms module parameter
Date: Mon, 2 Jun 2025 14:42:01 +0200	[thread overview]
Message-ID: <760f00da-7daf-446d-b595-e7a21f1fe34f@linux.intel.com> (raw)
In-Reply-To: <20250515093128.252041-1-jacek.lawrynowicz@linux.intel.com>

Applied to drm-misc-next

On 5/15/2025 11:31 AM, Jacek Lawrynowicz wrote:
> From: Karol Wachowski <karol.wachowski@intel.com>
> 
> Add a new inference_timeout_ms module parameter that specifies the
> maximum duration, in milliseconds, that an inference can take before
> a recovery is triggered.
> 
> Calculate the maximum number of heartbeat retries from the ratio of the
> inference timeout to the TDR timeout.
> 
> Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
> Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
> ---
>  drivers/accel/ivpu/ivpu_drv.h |  1 +
>  drivers/accel/ivpu/ivpu_hw.c  |  4 ++++
>  drivers/accel/ivpu/ivpu_pm.c  | 15 ++++++++++++---
>  3 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h
> index 5497e7030e915..b6d6b3238b596 100644
> --- a/drivers/accel/ivpu/ivpu_drv.h
> +++ b/drivers/accel/ivpu/ivpu_drv.h
> @@ -165,6 +165,7 @@ struct ivpu_device {
>  		int boot;
>  		int jsm;
>  		int tdr;
> +		int inference;
>  		int autosuspend;
>  		int d0i3_entry_msg;
>  		int state_dump_msg;
> diff --git a/drivers/accel/ivpu/ivpu_hw.c b/drivers/accel/ivpu/ivpu_hw.c
> index 633160470c939..08dcc31b56f4d 100644
> --- a/drivers/accel/ivpu/ivpu_hw.c
> +++ b/drivers/accel/ivpu/ivpu_hw.c
> @@ -94,12 +94,14 @@ static void timeouts_init(struct ivpu_device *vdev)
>  		vdev->timeout.boot = -1;
>  		vdev->timeout.jsm = -1;
>  		vdev->timeout.tdr = -1;
> +		vdev->timeout.inference = -1;
>  		vdev->timeout.autosuspend = -1;
>  		vdev->timeout.d0i3_entry_msg = -1;
>  	} else if (ivpu_is_fpga(vdev)) {
>  		vdev->timeout.boot = 50;
>  		vdev->timeout.jsm = 15000;
>  		vdev->timeout.tdr = 30000;
> +		vdev->timeout.inference = 900000;
>  		vdev->timeout.autosuspend = -1;
>  		vdev->timeout.d0i3_entry_msg = 500;
>  		vdev->timeout.state_dump_msg = 10000;
> @@ -107,6 +109,7 @@ static void timeouts_init(struct ivpu_device *vdev)
>  		vdev->timeout.boot = 50;
>  		vdev->timeout.jsm = 500;
>  		vdev->timeout.tdr = 10000;
> +		vdev->timeout.inference = 300000;
>  		vdev->timeout.autosuspend = 100;
>  		vdev->timeout.d0i3_entry_msg = 100;
>  		vdev->timeout.state_dump_msg = 10;
> @@ -114,6 +117,7 @@ static void timeouts_init(struct ivpu_device *vdev)
>  		vdev->timeout.boot = 1000;
>  		vdev->timeout.jsm = 500;
>  		vdev->timeout.tdr = 2000;
> +		vdev->timeout.inference = 60000;
>  		if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX)
>  			vdev->timeout.autosuspend = 10;
>  		else
> diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
> index ea30db181cd75..eacda1dbe8405 100644
> --- a/drivers/accel/ivpu/ivpu_pm.c
> +++ b/drivers/accel/ivpu/ivpu_pm.c
> @@ -33,8 +33,11 @@ static unsigned long ivpu_tdr_timeout_ms;
>  module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
>  MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
>  
> +static unsigned long ivpu_inference_timeout_ms;
> +module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
> +MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");
> +
>  #define PM_RESCHEDULE_LIMIT     5
> -#define PM_TDR_HEARTBEAT_LIMIT  30
>  
>  static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
>  {
> @@ -191,6 +194,10 @@ static void ivpu_job_timeout_work(struct work_struct *work)
>  {
>  	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
>  	struct ivpu_device *vdev = pm->vdev;
> +	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
> +	unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
> +					     vdev->timeout.inference;
> +	u64 inference_max_retries;
>  	u64 heartbeat;
>  
>  	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
> @@ -198,8 +205,10 @@ static void ivpu_job_timeout_work(struct work_struct *work)
>  		goto recovery;
>  	}
>  
> -	if (atomic_fetch_inc(&vdev->job_timeout_counter) > PM_TDR_HEARTBEAT_LIMIT) {
> -		ivpu_err(vdev, "Job timeout detected, heartbeat limit exceeded\n");
> +	inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
> +	if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
> +		ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
> +			 inference_max_retries);
>  		goto recovery;
>  	}
>  


      parent reply	other threads:[~2025-06-02 12:42 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-15  9:31 [PATCH] accel/ivpu: Add inference_timeout_ms module parameter Jacek Lawrynowicz
2025-05-16 14:39 ` Jeff Hugo
2025-06-02 12:42 ` Jacek Lawrynowicz [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=760f00da-7daf-446d-b595-e7a21f1fe34f@linux.intel.com \
    --to=jacek.lawrynowicz@linux.intel.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=jeff.hugo@oss.qualcomm.com \
    --cc=karol.wachowski@intel.com \
    --cc=lizhi.hou@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.