Re: [PATCH V1] accel/amdxdna: Check for device hang on job timeout

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Lizhi Hou <lizhi.hou@amd.com>
To: Mario Limonciello <mario.limonciello@amd.com>,
	<ogabbay@kernel.org>, <quic_jhugo@quicinc.com>,
	<dri-devel@lists.freedesktop.org>,
	<maciej.falkowski@linux.intel.com>
Cc: <linux-kernel@vger.kernel.org>, <max.zhen@amd.com>,
	<sonal.santan@amd.com>
Subject: Re: [PATCH V1] accel/amdxdna: Check for device hang on job timeout
Date: Mon, 13 Apr 2026 09:45:17 -0700	[thread overview]
Message-ID: <fe1efb3a-179f-e31d-b7fc-2b57096a138b@amd.com> (raw)
In-Reply-To: <561d6991-d83e-40be-8baf-e705e6d5159d@amd.com>

Applied to drm-misc-next

On 4/10/26 13:37, Mario Limonciello wrote:
>
>
> On 4/9/26 12:58, Lizhi Hou wrote:
>> A job timeout does not necessarily indicate that the device is hung, as
>> it may still be processing other jobs.
>>
>> Track whether any jobs have been successfully submitted or completed,
>> and use this information to determine if the device is making forward
>> progress. If so, return DRM_GPU_SCHED_STAT_NO_HANG instead of treating
>> the timeout as a device hang.
>>
>> In the meantime, the timeout interval is changed to 2 seconds, which
>> meets the userspace requirement.
>>
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
>
>> ---
>>   drivers/accel/amdxdna/aie2_ctx.c | 36 +++++++++++++++++++++++++++-----
>>   drivers/accel/amdxdna/aie2_pci.h |  6 ++++++
>>   2 files changed, 37 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/aie2_ctx.c 
>> b/drivers/accel/amdxdna/aie2_ctx.c
>> index f97755d60fa3..ddcf06a6b80c 100644
>> --- a/drivers/accel/amdxdna/aie2_ctx.c
>> +++ b/drivers/accel/amdxdna/aie2_ctx.c
>> @@ -27,7 +27,9 @@ static bool force_cmdlist = true;
>>   module_param(force_cmdlist, bool, 0600);
>>   MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default 
>> true)");
>>   -#define HWCTX_MAX_TIMEOUT    60000 /* milliseconds */
>> +uint tdr_timeout_ms = 2000;
>> +module_param(tdr_timeout_ms, int, 0400);
>> +MODULE_PARM_DESC(tdr_timeout_ms, "TDR (Timeout Detection and 
>> Recovery) timeout in milliseconds (0 = disable)");
>>     struct aie2_ctx_health {
>>       struct amdxdna_ctx_health header;
>> @@ -39,6 +41,24 @@ struct aie2_ctx_health {
>>       u32 fatal_error_app_module;
>>   };
>>   +static inline void aie2_tdr_signal(struct amdxdna_dev *xdna)
>> +{
>> +    WRITE_ONCE(xdna->dev_handle->tdr_status, AIE2_TDR_SIGNALED);
>> +}
>> +
>> +static bool aie2_tdr_detect(struct amdxdna_dev *xdna)
>> +{
>> +    struct amdxdna_dev_hdl *ndev = xdna->dev_handle;
>> +
>> +    if (READ_ONCE(ndev->tdr_status) == AIE2_TDR_WAIT) {
>> +        XDNA_ERR(xdna, "TDR timeout detected");
>> +        return true;
>> +    }
>> +
>> +    WRITE_ONCE(ndev->tdr_status, AIE2_TDR_WAIT);
>> +    return false;
>> +}
>> +
>>   static void aie2_job_release(struct kref *ref)
>>   {
>>       struct amdxdna_sched_job *job;
>> @@ -177,6 +197,7 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
>>         trace_xdna_job(&job->base, job->hwctx->name, "signaled 
>> fence", job->seq);
>>   +    aie2_tdr_signal(job->hwctx->client->xdna);
>>       job->hwctx->priv->completed++;
>>       dma_fence_signal(fence);
>>   @@ -385,6 +406,8 @@ aie2_sched_job_run(struct drm_sched_job 
>> *sched_job)
>>           aie2_job_put(job);
>>           mmput(job->mm);
>>           fence = ERR_PTR(ret);
>> +    } else {
>> +        aie2_tdr_signal(hwctx->client->xdna);
>>       }
>>       trace_xdna_job(sched_job, hwctx->name, "sent to device", 
>> job->seq);
>>   @@ -415,9 +438,12 @@ aie2_sched_job_timedout(struct drm_sched_job 
>> *sched_job)
>>         xdna = hwctx->client->xdna;
>>       trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
>> -    job->job_timeout = true;
>>   -    mutex_lock(&xdna->dev_lock);
>> +    guard(mutex)(&xdna->dev_lock);
>> +
>> +    if (!aie2_tdr_detect(xdna))
>> +        return DRM_GPU_SCHED_STAT_NO_HANG;
>> +
>>       report = kzalloc_obj(*report);
>>       if (!report)
>>           goto reset_hwctx;
>> @@ -429,10 +455,10 @@ aie2_sched_job_timedout(struct drm_sched_job 
>> *sched_job)
>>           job->aie2_job_health = report;
>>     reset_hwctx:
>> +    job->job_timeout = true;
>>       aie2_hwctx_stop(xdna, hwctx, sched_job);
>>         aie2_hwctx_restart(xdna, hwctx);
>> -    mutex_unlock(&xdna->dev_lock);
>>         return DRM_GPU_SCHED_STAT_RESET;
>>   }
>> @@ -608,7 +634,7 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
>>           .ops = &sched_ops,
>>           .num_rqs = DRM_SCHED_PRIORITY_COUNT,
>>           .credit_limit = HWCTX_MAX_CMDS,
>> -        .timeout = msecs_to_jiffies(HWCTX_MAX_TIMEOUT),
>> +        .timeout = msecs_to_jiffies(tdr_timeout_ms),
>>           .name = "amdxdna_js",
>>           .dev = xdna->ddev.dev,
>>       };
>> diff --git a/drivers/accel/amdxdna/aie2_pci.h 
>> b/drivers/accel/amdxdna/aie2_pci.h
>> index 7c308672b5fe..81564483cb16 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.h
>> +++ b/drivers/accel/amdxdna/aie2_pci.h
>> @@ -165,6 +165,11 @@ struct aie2_exec_msg_ops {
>>       u32 (*get_chain_msg_op)(u32 cmd_op);
>>   };
>>   +enum aie2_tdr_status {
>> +    AIE2_TDR_WAIT,
>> +    AIE2_TDR_SIGNALED,
>> +};
>> +
>>   struct amdxdna_dev_hdl {
>>       struct aie_device        aie;
>>       const struct amdxdna_dev_priv    *priv;
>> @@ -197,6 +202,7 @@ struct amdxdna_dev_hdl {
>>       u32                hwctx_num;
>>         struct amdxdna_async_error    last_async_err;
>> +    enum aie2_tdr_status        tdr_status;
>>   };
>>     struct aie2_hw_ops {
>

     prev parent reply	other threads:[~2026-04-13 16:45 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-09 17:58 [PATCH V1] accel/amdxdna: Check for device hang on job timeout Lizhi Hou
2026-04-10 20:37 ` Mario Limonciello
2026-04-13 16:45   ` Lizhi Hou [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=fe1efb3a-179f-e31d-b7fc-2b57096a138b@amd.com \
    --to=lizhi.hou@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=maciej.falkowski@linux.intel.com \
    --cc=mario.limonciello@amd.com \
    --cc=max.zhen@amd.com \
    --cc=ogabbay@kernel.org \
    --cc=quic_jhugo@quicinc.com \
    --cc=sonal.santan@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.