Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] drm/xe: Enhance CT_DEAD for production builds
@ 2025-11-21 16:25 Matthew Brost
  0 siblings, 0 replies; only message in thread
From: Matthew Brost @ 2025-11-21 16:25 UTC (permalink / raw)
  To: intel-xe; +Cc: Daniele Ceraolo Spurio

If the CT fails on production builds, log its state to dmesg for quick
analysis. Also, log the CT state if a G2H fence times out.

v2:
 - Actually log CT state if a G2H fence times out

Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_guc_ct.c | 36 ++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 2697d711adb2..6845d609ec10 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -41,8 +41,8 @@ static void safe_mode_worker_func(struct work_struct *w);
 static void ct_exit_safe_mode(struct xe_guc_ct *ct);
 static void guc_ct_change_state(struct xe_guc_ct *ct,
 				enum xe_guc_ct_state state);
+static void xe_guc_ct_print_err_state(struct xe_guc_ct *ct, int reason);
 
-#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
 enum {
 	/* Internal states, not error conditions */
 	CT_DEAD_STATE_REARM,			/* 0x0001 */
@@ -63,18 +63,21 @@ enum {
 	CT_DEAD_PARSE_G2H_ORIGIN,		/* 0x2000 */
 	CT_DEAD_PARSE_G2H_TYPE,			/* 0x4000 */
 	CT_DEAD_CRASH,				/* 0x8000 */
+	CT_DEAD_G2H_TIMEOUT,			/* 0x10000 */
 };
 
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
 static void ct_dead_worker_func(struct work_struct *w);
 static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code);
 
 #define CT_DEAD(ct, ctb, reason_code)		ct_dead_capture((ct), (ctb), CT_DEAD_##reason_code)
 #else
-#define CT_DEAD(ct, ctb, reason)			\
-	do {						\
-		struct guc_ctb *_ctb = (ctb);		\
-		if (_ctb)				\
-			_ctb->info.broken = true;	\
+#define CT_DEAD(ct, ctb, reason_code)					\
+	do {								\
+		struct guc_ctb *_ctb = (ctb);				\
+		xe_guc_ct_print_err_state(ct, CT_DEAD_##reason_code);	\
+		if (_ctb)						\
+			_ctb->info.broken = true;			\
 	} while (0)
 #endif
 
@@ -1220,6 +1223,7 @@ static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
 	if (!ret) {
 		xe_gt_err(gt, "Timed out wait for G2H, fence %u, action %04x, done %s",
 			  g2h_fence.seqno, action[0], str_yes_no(g2h_fence.done));
+		xe_guc_ct_print_err_state(ct, CT_DEAD_G2H_TIMEOUT);
 		xa_erase(&ct->fence_lookup, g2h_fence.seqno);
 		mutex_unlock(&ct->lock);
 		return -ETIME;
@@ -2016,6 +2020,26 @@ void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool want_ctb)
 	xe_guc_ct_snapshot_free(snapshot);
 }
 
+static void xe_guc_ct_print_err_state(struct xe_guc_ct *ct, int reason)
+{
+	struct xe_device *xe = ct_to_xe(ct);
+	struct xe_gt *gt = ct_to_gt(ct);
+	struct guc_ctb *h2g = &ct->ctbs.h2g;
+	struct guc_ctb *g2h = &ct->ctbs.g2h;
+
+	/* Don't spam dmesg, only print first failure */
+	if (h2g->info.broken || g2h->info.broken)
+		return;
+
+	xe_gt_err(gt, "CT_DEAD: reason=%d\n", reason);
+	xe_gt_err(gt, "H2G.head=%d, H2G.tail=%d, H2G.status=%d\n",
+		  desc_read(xe, h2g, head), desc_read(xe, h2g, tail),
+		  desc_read(xe, h2g, status));
+	xe_gt_err(gt, "G2H.head=%d, G2H.tail=%d, G2H.status=%d\n",
+		  desc_read(xe, g2h, head), desc_read(xe, g2h, tail),
+		  desc_read(xe, g2h, status));
+}
+
 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
 
 #ifdef CONFIG_FUNCTION_ERROR_INJECTION
-- 
2.34.1


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2025-11-21 16:25 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-21 16:25 [PATCH v2] drm/xe: Enhance CT_DEAD for production builds Matthew Brost

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox