public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH V1] accel/amdxdna: Check for device hang on job timeout
@ 2026-04-09 17:58 Lizhi Hou
  2026-04-10 20:37 ` Mario Limonciello
  0 siblings, 1 reply; 3+ messages in thread
From: Lizhi Hou @ 2026-04-09 17:58 UTC (permalink / raw)
  To: ogabbay, quic_jhugo, dri-devel, mario.limonciello,
	maciej.falkowski
  Cc: Lizhi Hou, linux-kernel, max.zhen, sonal.santan

A job timeout does not necessarily indicate that the device is hung, as
it may still be processing other jobs.

Track whether any jobs have been successfully submitted or completed,
and use this information to determine if the device is making forward
progress. If so, return DRM_GPU_SCHED_STAT_NO_HANG instead of treating
the timeout as a device hang.

Also reduce the default timeout interval from 60 seconds to 2 seconds to meet
the userspace requirement, and expose it as the tdr_timeout_ms module parameter.

Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
 drivers/accel/amdxdna/aie2_ctx.c | 36 +++++++++++++++++++++++++++-----
 drivers/accel/amdxdna/aie2_pci.h |  6 ++++++
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index f97755d60fa3..ddcf06a6b80c 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -27,7 +27,9 @@ static bool force_cmdlist = true;
 module_param(force_cmdlist, bool, 0600);
 MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)");
 
-#define HWCTX_MAX_TIMEOUT	60000 /* milliseconds */
+uint tdr_timeout_ms = 2000; /* job timeout in ms; aie2_hwctx_init() passes it to msecs_to_jiffies() */
+module_param(tdr_timeout_ms, uint, 0400); /* param type must match the variable's type (uint) */
+MODULE_PARM_DESC(tdr_timeout_ms, "TDR (Timeout Detection and Recovery) timeout in milliseconds (0 = disable)");
 
 struct aie2_ctx_health {
 	struct amdxdna_ctx_health header;
@@ -39,6 +41,24 @@ struct aie2_ctx_health {
 	u32 fatal_error_app_module;
 };
 
+static inline void aie2_tdr_signal(struct amdxdna_dev *xdna)
+{	/* Mark the device as having made progress; read back by aie2_tdr_detect(). */
+	WRITE_ONCE(xdna->dev_handle->tdr_status, AIE2_TDR_SIGNALED);
+}
+
+static bool aie2_tdr_detect(struct amdxdna_dev *xdna)
+{	/* Returns true if tdr_status is still AIE2_TDR_WAIT, i.e. nothing signaled it. */
+	struct amdxdna_dev_hdl *ndev = xdna->dev_handle;
+
+	if (READ_ONCE(ndev->tdr_status) == AIE2_TDR_WAIT) {
+		XDNA_ERR(xdna, "TDR timeout detected");
+		return true;
+	}
+
+	WRITE_ONCE(ndev->tdr_status, AIE2_TDR_WAIT); /* re-arm for the next call */
+	return false;
+}
+
 static void aie2_job_release(struct kref *ref)
 {
 	struct amdxdna_sched_job *job;
@@ -177,6 +197,7 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
 
 	trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq);
 
+	aie2_tdr_signal(job->hwctx->client->xdna);
 	job->hwctx->priv->completed++;
 	dma_fence_signal(fence);
 
@@ -385,6 +406,8 @@ aie2_sched_job_run(struct drm_sched_job *sched_job)
 		aie2_job_put(job);
 		mmput(job->mm);
 		fence = ERR_PTR(ret);
+	} else {
+		aie2_tdr_signal(hwctx->client->xdna);
 	}
 	trace_xdna_job(sched_job, hwctx->name, "sent to device", job->seq);
 
@@ -415,9 +438,12 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
 
 	xdna = hwctx->client->xdna;
 	trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
-	job->job_timeout = true;
 
-	mutex_lock(&xdna->dev_lock);
+	guard(mutex)(&xdna->dev_lock);
+
+	if (!aie2_tdr_detect(xdna))
+		return DRM_GPU_SCHED_STAT_NO_HANG;
+
 	report = kzalloc_obj(*report);
 	if (!report)
 		goto reset_hwctx;
@@ -429,10 +455,10 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
 		job->aie2_job_health = report;
 
 reset_hwctx:
+	job->job_timeout = true;
 	aie2_hwctx_stop(xdna, hwctx, sched_job);
 
 	aie2_hwctx_restart(xdna, hwctx);
-	mutex_unlock(&xdna->dev_lock);
 
 	return DRM_GPU_SCHED_STAT_RESET;
 }
@@ -608,7 +634,7 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
 		.ops = &sched_ops,
 		.num_rqs = DRM_SCHED_PRIORITY_COUNT,
 		.credit_limit = HWCTX_MAX_CMDS,
-		.timeout = msecs_to_jiffies(HWCTX_MAX_TIMEOUT),
+		.timeout = msecs_to_jiffies(tdr_timeout_ms),
 		.name = "amdxdna_js",
 		.dev = xdna->ddev.dev,
 	};
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 7c308672b5fe..81564483cb16 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -165,6 +165,11 @@ struct aie2_exec_msg_ops {
 	u32 (*get_chain_msg_op)(u32 cmd_op);
 };
 
+enum aie2_tdr_status {
+	AIE2_TDR_WAIT,		/* armed by aie2_tdr_detect(); still set on the next check => hang */
+	AIE2_TDR_SIGNALED,	/* set by aie2_tdr_signal() when a job is sent or its fence signals */
+};
+
 struct amdxdna_dev_hdl {
 	struct aie_device		aie;
 	const struct amdxdna_dev_priv	*priv;
@@ -197,6 +202,7 @@ struct amdxdna_dev_hdl {
 	u32				hwctx_num;
 
 	struct amdxdna_async_error	last_async_err;
+	enum aie2_tdr_status		tdr_status;
 };
 
 struct aie2_hw_ops {
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-04-13 16:45 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2026-04-09 17:58 [PATCH V1] accel/amdxdna: Check for device hang on job timeout Lizhi Hou
2026-04-10 20:37 ` Mario Limonciello
2026-04-13 16:45   ` Lizhi Hou

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox