public inbox for intel-xe@lists.freedesktop.org
 help / color / mirror / Atom feed
From: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
To: Michal Wajdeczko <michal.wajdeczko@intel.com>,
	<intel-xe@lists.freedesktop.org>
Cc: Matthew Brost <matthew.brost@intel.com>
Subject: Re: [PATCH] drm/xe/guc: Add support for NO_RESPONSE_BUSY in CTB
Date: Tue, 7 Apr 2026 15:18:05 -0700	[thread overview]
Message-ID: <6b5850d5-879f-4ea0-ad29-63ef0d8474d1@intel.com> (raw)
In-Reply-To: <20260403204433.5765-1-michal.wajdeczko@intel.com>



On 4/3/2026 1:44 PM, Michal Wajdeczko wrote:
> We only have support for G2H NO_RESPONSE_BUSY messages over MMIO,
> but it turned out that GuC also uses that type of messages in CTB.
>
> The following error was recently observed on BMG after adding VGT
> policy updates to the GT restart sequence:
>
>   [] xe 0000:03:00.0: [drm] *ERROR* Tile0: GT1: G2H channel broken on read, type=3, reset required
>   [] xe 0000:03:00.0: [drm] *ERROR* Tile0: GT1: CT dequeue failed: -95
>   ...
>   [] xe 0000:03:00.0: [drm] *ERROR* Tile0: GT1: Timed out wait for G2H, fence 21965, action 5502, done no
>   [] xe 0000:03:00.0: [drm] PF: Tile0: GT1: Failed to push 1 policy KLV (-ETIME)
>   [] xe 0000:03:00.0: [drm] Tile0: GT1: { key 0x8004 : no value } # engine_group_config
>
> where type=3 was this unrecognized NO_RESPONSE_BUSY message.
>
> Note that GuC might send the real RESPONSE message right after
> the BUSY message, so we must be prepared to update our g2h_fence
> data twice before sender actually wakes up and clears the flags.
>
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> ---
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Link: https://intel-gfx-ci.01.org/tree/intel-xe/xe-pw-164119v2/shard-bmg-9/igt@xe_exec_reset@gt-reset.html
> ---
>   drivers/gpu/drm/xe/xe_guc_ct.c | 29 +++++++++++++++++++++++++++--
>   1 file changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index a11cff7a20be..19305acb98e4 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -186,6 +186,7 @@ static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action) { }
>   struct g2h_fence {
>   	u32 *response_buffer;
>   	u32 seqno;
> +	/* fields below this point are setup based on the response */
>   	u32 response_data;
>   	u16 response_len;
>   	u16 error;
> @@ -193,6 +194,7 @@ struct g2h_fence {
>   	u16 reason;
>   	bool cancel;
>   	bool retry;
> +	bool wait;
>   	bool fail;
>   	bool done;
>   };
> @@ -204,6 +206,11 @@ static void g2h_fence_init(struct g2h_fence *g2h_fence, u32 *response_buffer)
>   	g2h_fence->seqno = ~0x0;
>   }
>   
> +static void g2h_fence_void(struct g2h_fence *g2h_fence)

I'm not convinced that g2h_fence_void is the correct function name here. 
Maybe g2h_fence_clear_response or something like that?

> +{
> +	memset_after(g2h_fence, 0, seqno);
> +}
> +
>   static void g2h_fence_cancel(struct g2h_fence *g2h_fence)
>   {
>   	g2h_fence->cancel = true;
> @@ -1331,6 +1338,7 @@ static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
>   	/* READ_ONCEs pairs with WRITE_ONCEs in parse_g2h_response
>   	 * and g2h_fence_cancel.
>   	 */
> +wait_again:
>   	ret = wait_event_timeout(ct->g2h_fence_wq, READ_ONCE(g2h_fence.done), HZ);
>   	if (!ret) {
>   		LNL_FLUSH_WORK(&ct->g2h_worker);
> @@ -1356,6 +1364,12 @@ static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
>   		return -ETIME;
>   	}
>   
> +	if (g2h_fence.wait) {
> +		xe_gt_dbg(gt, "H2G action %#x busy...\n", action[0]);
> +		g2h_fence_void(&g2h_fence);
> +		mutex_unlock(&ct->lock);
> +		goto wait_again;
> +	}
>   	if (g2h_fence.retry) {
>   		xe_gt_dbg(gt, "H2G action %#x retrying: reason %#x\n",
>   			  action[0], g2h_fence.reason);
> @@ -1508,7 +1522,12 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
>   		return -EPROTO;
>   	}
>   
> -	g2h_fence = xa_erase(&ct->fence_lookup, fence);
> +	/* don't erase as we still expect a final response with the same fence */
> +	if (type == GUC_HXG_TYPE_NO_RESPONSE_BUSY)
> +		g2h_fence = xa_load(&ct->fence_lookup, fence);
> +	else
> +		g2h_fence = xa_erase(&ct->fence_lookup, fence);
> +
>   	if (unlikely(!g2h_fence)) {

if we hit this error with a NO_RESPONSE_BUSY we'll release the memory 
with the fence still in the xa, which seems wrong.

>   		/* Don't tear down channel, as send could've timed out */
>   		/* CT_DEAD(ct, NULL, PARSE_G2H_UNKNOWN); */
> @@ -1518,6 +1537,7 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
>   	}
>   
>   	xe_gt_assert(gt, fence == g2h_fence->seqno);
> +	g2h_fence_void(g2h_fence);

Is this here because we might be parsing the G2H with the actual 
response before the waiter has had time to process the initial BUSY 
response? It might be worth adding a comment to explain that.

Daniele

>   
>   	if (type == GUC_HXG_TYPE_RESPONSE_FAILURE) {
>   		g2h_fence->fail = true;
> @@ -1526,6 +1546,9 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
>   	} else if (type == GUC_HXG_TYPE_NO_RESPONSE_RETRY) {
>   		g2h_fence->retry = true;
>   		g2h_fence->reason = FIELD_GET(GUC_HXG_RETRY_MSG_0_REASON, hxg[0]);
> +	} else if (type == GUC_HXG_TYPE_NO_RESPONSE_BUSY) {
> +		g2h_fence->wait = true;
> +		g2h_fence->reason = FIELD_GET(GUC_HXG_BUSY_MSG_0_COUNTER, hxg[0]);
>   	} else if (g2h_fence->response_buffer) {
>   		g2h_fence->response_len = hxg_len;
>   		memcpy(g2h_fence->response_buffer, hxg, hxg_len * sizeof(u32));
> @@ -1533,7 +1556,8 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
>   		g2h_fence->response_data = FIELD_GET(GUC_HXG_RESPONSE_MSG_0_DATA0, hxg[0]);
>   	}
>   
> -	g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
> +	if (!g2h_fence->wait)
> +		g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
>   
>   	/* WRITE_ONCE pairs with READ_ONCEs in guc_ct_send_recv. */
>   	WRITE_ONCE(g2h_fence->done, true);
> @@ -1570,6 +1594,7 @@ static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
>   	case GUC_HXG_TYPE_RESPONSE_SUCCESS:
>   	case GUC_HXG_TYPE_RESPONSE_FAILURE:
>   	case GUC_HXG_TYPE_NO_RESPONSE_RETRY:
> +	case GUC_HXG_TYPE_NO_RESPONSE_BUSY:
>   		ret = parse_g2h_response(ct, msg, len);
>   		break;
>   	default:


  parent reply	other threads:[~2026-04-07 22:18 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-03 20:44 [PATCH] drm/xe/guc: Add support for NO_RESPONSE_BUSY in CTB Michal Wajdeczko
2026-04-03 20:50 ` ✗ CI.checkpatch: warning for " Patchwork
2026-04-03 20:51 ` ✓ CI.KUnit: success " Patchwork
2026-04-03 21:27 ` ✓ Xe.CI.BAT: " Patchwork
2026-04-03 22:31 ` ✗ Xe.CI.FULL: failure " Patchwork
2026-04-07 22:18 ` Daniele Ceraolo Spurio [this message]
2026-04-08 13:54   ` [PATCH] " Michal Wajdeczko
2026-04-08 17:01     ` Daniele Ceraolo Spurio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=6b5850d5-879f-4ea0-ad29-63ef0d8474d1@intel.com \
    --to=daniele.ceraolospurio@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.brost@intel.com \
    --cc=michal.wajdeczko@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox