From: Mario Limonciello <mario.limonciello@amd.com>
To: Lizhi Hou <lizhi.hou@amd.com>,
ogabbay@kernel.org, quic_jhugo@quicinc.com,
dri-devel@lists.freedesktop.org,
maciej.falkowski@linux.intel.com
Cc: linux-kernel@vger.kernel.org, max.zhen@amd.com, sonal.santan@amd.com
Subject: Re: [PATCH V1] accel/amdxdna: Check for device hang on job timeout
Date: Fri, 10 Apr 2026 15:37:59 -0500 [thread overview]
Message-ID: <561d6991-d83e-40be-8baf-e705e6d5159d@amd.com> (raw)
In-Reply-To: <20260409175826.195665-1-lizhi.hou@amd.com>
On 4/9/26 12:58, Lizhi Hou wrote:
> A job timeout does not necessarily indicate that the device is hung, as
> it may still be processing other jobs.
>
> Track whether any jobs have been successfully submitted or completed,
> and use this information to determine if the device is making forward
> progress. If so, return DRM_GPU_SCHED_STAT_NO_HANG instead of treating
> the timeout as a device hang.
>
> Meanwhile, the timeout interval is changed to 2 seconds, which meets the
> userspace requirement.
>
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
> ---
> drivers/accel/amdxdna/aie2_ctx.c | 36 +++++++++++++++++++++++++++-----
> drivers/accel/amdxdna/aie2_pci.h | 6 ++++++
> 2 files changed, 37 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
> index f97755d60fa3..ddcf06a6b80c 100644
> --- a/drivers/accel/amdxdna/aie2_ctx.c
> +++ b/drivers/accel/amdxdna/aie2_ctx.c
> @@ -27,7 +27,9 @@ static bool force_cmdlist = true;
> module_param(force_cmdlist, bool, 0600);
> MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)");
>
> -#define HWCTX_MAX_TIMEOUT 60000 /* milliseconds */
> +uint tdr_timeout_ms = 2000;
> +module_param(tdr_timeout_ms, int, 0400);
> +MODULE_PARM_DESC(tdr_timeout_ms, "TDR (Timeout Detection and Recovery) timeout in milliseconds (0 = disable)");
>
> struct aie2_ctx_health {
> struct amdxdna_ctx_health header;
> @@ -39,6 +41,24 @@ struct aie2_ctx_health {
> u32 fatal_error_app_module;
> };
>
> +static inline void aie2_tdr_signal(struct amdxdna_dev *xdna)
> +{
> + WRITE_ONCE(xdna->dev_handle->tdr_status, AIE2_TDR_SIGNALED);
> +}
> +
> +static bool aie2_tdr_detect(struct amdxdna_dev *xdna)
> +{
> + struct amdxdna_dev_hdl *ndev = xdna->dev_handle;
> +
> + if (READ_ONCE(ndev->tdr_status) == AIE2_TDR_WAIT) {
> + XDNA_ERR(xdna, "TDR timeout detected");
> + return true;
> + }
> +
> + WRITE_ONCE(ndev->tdr_status, AIE2_TDR_WAIT);
> + return false;
> +}
> +
> static void aie2_job_release(struct kref *ref)
> {
> struct amdxdna_sched_job *job;
> @@ -177,6 +197,7 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
>
> trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq);
>
> + aie2_tdr_signal(job->hwctx->client->xdna);
> job->hwctx->priv->completed++;
> dma_fence_signal(fence);
>
> @@ -385,6 +406,8 @@ aie2_sched_job_run(struct drm_sched_job *sched_job)
> aie2_job_put(job);
> mmput(job->mm);
> fence = ERR_PTR(ret);
> + } else {
> + aie2_tdr_signal(hwctx->client->xdna);
> }
> trace_xdna_job(sched_job, hwctx->name, "sent to device", job->seq);
>
> @@ -415,9 +438,12 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
>
> xdna = hwctx->client->xdna;
> trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
> - job->job_timeout = true;
>
> - mutex_lock(&xdna->dev_lock);
> + guard(mutex)(&xdna->dev_lock);
> +
> + if (!aie2_tdr_detect(xdna))
> + return DRM_GPU_SCHED_STAT_NO_HANG;
> +
> report = kzalloc_obj(*report);
> if (!report)
> goto reset_hwctx;
> @@ -429,10 +455,10 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
> job->aie2_job_health = report;
>
> reset_hwctx:
> + job->job_timeout = true;
> aie2_hwctx_stop(xdna, hwctx, sched_job);
>
> aie2_hwctx_restart(xdna, hwctx);
> - mutex_unlock(&xdna->dev_lock);
>
> return DRM_GPU_SCHED_STAT_RESET;
> }
> @@ -608,7 +634,7 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
> .ops = &sched_ops,
> .num_rqs = DRM_SCHED_PRIORITY_COUNT,
> .credit_limit = HWCTX_MAX_CMDS,
> - .timeout = msecs_to_jiffies(HWCTX_MAX_TIMEOUT),
> + .timeout = msecs_to_jiffies(tdr_timeout_ms),
> .name = "amdxdna_js",
> .dev = xdna->ddev.dev,
> };
> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
> index 7c308672b5fe..81564483cb16 100644
> --- a/drivers/accel/amdxdna/aie2_pci.h
> +++ b/drivers/accel/amdxdna/aie2_pci.h
> @@ -165,6 +165,11 @@ struct aie2_exec_msg_ops {
> u32 (*get_chain_msg_op)(u32 cmd_op);
> };
>
> +enum aie2_tdr_status {
> + AIE2_TDR_WAIT,
> + AIE2_TDR_SIGNALED,
> +};
> +
> struct amdxdna_dev_hdl {
> struct aie_device aie;
> const struct amdxdna_dev_priv *priv;
> @@ -197,6 +202,7 @@ struct amdxdna_dev_hdl {
> u32 hwctx_num;
>
> struct amdxdna_async_error last_async_err;
> + enum aie2_tdr_status tdr_status;
> };
>
> struct aie2_hw_ops {
prev parent reply other threads:[~2026-04-10 20:38 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-09 17:58 [PATCH V1] accel/amdxdna: Check for device hang on job timeout Lizhi Hou
2026-04-10 20:37 ` Mario Limonciello [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=561d6991-d83e-40be-8baf-e705e6d5159d@amd.com \
--to=mario.limonciello@amd.com \
--cc=dri-devel@lists.freedesktop.org \
--cc=linux-kernel@vger.kernel.org \
--cc=lizhi.hou@amd.com \
--cc=maciej.falkowski@linux.intel.com \
--cc=max.zhen@amd.com \
--cc=ogabbay@kernel.org \
--cc=quic_jhugo@quicinc.com \
--cc=sonal.santan@amd.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox