From: "Michał Winiarski" <michal.winiarski@intel.com>
To: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: <intel-xe@lists.freedesktop.org>
Subject: Re: [PATCH 3/3] drm/xe/vf: Improve getting clean NULL context
Date: Mon, 23 Mar 2026 10:01:25 +0100 [thread overview]
Message-ID: <acEA8JxXmhy4ASpF@nostramo> (raw)
In-Reply-To: <20260303201354.17948-4-michal.wajdeczko@intel.com>
On Tue, Mar 03, 2026 at 09:13:54PM +0100, Michal Wajdeczko wrote:
> There is a small risk that when fetching a NULL context image the
> VF may get a tweaked context image prepared by another VF that was
> previously running on the engine before the GuC scheduler switched
> the VFs.
>
> To avoid that risk, without forcing the GuC scheduler to trigger a
> costly engine reset on every VF switch, use a watchdog mechanism that,
> when configured with an impossible condition, triggers an interrupt,
> which the GuC will handle by doing an engine reset. Also adjust job size to
> account for additional dwords with watchdog setup.
>
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>
Thanks,
-Michał
> ---
> drivers/gpu/drm/xe/xe_gt.c | 9 ++++--
> drivers/gpu/drm/xe/xe_ring_ops.c | 37 +++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_ring_ops_types.h | 2 +-
> drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++
> 4 files changed, 46 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index b455af1e6072..cf639729a22d 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -169,7 +169,7 @@ static void xe_gt_enable_comp_1wcoh(struct xe_gt *gt)
> static void gt_reset_worker(struct work_struct *w);
>
> static int emit_job_sync(struct xe_exec_queue *q, struct xe_bb *bb,
> - long timeout_jiffies)
> + long timeout_jiffies, bool force_reset)
> {
> struct xe_sched_job *job;
> struct dma_fence *fence;
> @@ -179,6 +179,8 @@ static int emit_job_sync(struct xe_exec_queue *q, struct xe_bb *bb,
> if (IS_ERR(job))
> return PTR_ERR(job);
>
> + job->ring_ops_force_reset = force_reset;
> +
> xe_sched_job_arm(job);
> fence = dma_fence_get(&job->drm.s_fence->finished);
> xe_sched_job_push(job);
> @@ -202,7 +204,7 @@ static int emit_nop_job(struct xe_gt *gt, struct xe_exec_queue *q)
> if (IS_ERR(bb))
> return PTR_ERR(bb);
>
> - ret = emit_job_sync(q, bb, HZ);
> + ret = emit_job_sync(q, bb, HZ, false);
> xe_bb_free(bb, NULL);
>
> return ret;
> @@ -367,7 +369,8 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
>
> bb->len = cs - bb->cs;
>
> - ret = emit_job_sync(q, bb, HZ);
> + /* only VFs need to trigger reset to get a clean NULL context */
> + ret = emit_job_sync(q, bb, HZ, IS_SRIOV_VF(gt_to_xe(gt)));
>
> xe_bb_free(bb, NULL);
>
> diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
> index 53d420d72164..bce7d93ce3a3 100644
> --- a/drivers/gpu/drm/xe/xe_ring_ops.c
> +++ b/drivers/gpu/drm/xe/xe_ring_ops.c
> @@ -256,6 +256,32 @@ static int emit_copy_timestamp(struct xe_device *xe, struct xe_lrc *lrc,
> return i;
> }
>
> +static int emit_fake_watchdog(struct xe_lrc *lrc, u32 *dw, int i)
> +{
> + /*
> + * Setup a watchdog with impossible condition to always trigger a
> + * hardware interrupt that would force the GuC to reset the engine.
> + */
> +
> + dw[i++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) | MI_LRI_LRM_CS_MMIO;
> + dw[i++] = PR_CTR_THRSH(0).addr;
> + dw[i++] = 2; /* small threshold */
> + dw[i++] = PR_CTR_CTRL(0).addr;
> + dw[i++] = CTR_LOGIC_OP(START);
> +
> + dw[i++] = MI_SEMAPHORE_WAIT | MI_SEMW_GGTT | MI_SEMW_POLL | MI_SEMW_COMPARE(SAD_EQ_SDD);
> + dw[i++] = 0xdead; /* this should never be seen */
> + dw[i++] = lower_32_bits(xe_lrc_ggtt_addr(lrc));
> + dw[i++] = upper_32_bits(xe_lrc_ggtt_addr(lrc));
> + dw[i++] = 0; /* unused token */
> +
> + dw[i++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1) | MI_LRI_LRM_CS_MMIO;
> + dw[i++] = PR_CTR_CTRL(0).addr;
> + dw[i++] = CTR_LOGIC_OP(STOP);
> +
> + return i;
> +}
> +
> /* for engines that don't require any special HW handling (no EUs, no aux inval, etc) */
> static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc,
> u64 batch_addr, u32 *head, u32 seqno)
> @@ -266,6 +292,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc
>
> *head = lrc->ring.tail;
>
> + if (job->ring_ops_force_reset)
> + i = emit_fake_watchdog(lrc, dw, i);
> +
> i = emit_copy_timestamp(gt_to_xe(gt), lrc, dw, i);
>
> if (job->ring_ops_flush_tlb) {
> @@ -324,6 +353,9 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
>
> *head = lrc->ring.tail;
>
> + if (job->ring_ops_force_reset)
> + i = emit_fake_watchdog(lrc, dw, i);
> +
> i = emit_copy_timestamp(xe, lrc, dw, i);
>
> dw[i++] = preparser_disable(true);
> @@ -381,6 +413,9 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
>
> *head = lrc->ring.tail;
>
> + if (job->ring_ops_force_reset)
> + i = emit_fake_watchdog(lrc, dw, i);
> +
> i = emit_copy_timestamp(xe, lrc, dw, i);
>
> dw[i++] = preparser_disable(true);
> @@ -433,6 +468,8 @@ static void emit_migration_job_gen12(struct xe_sched_job *job,
>
> *head = lrc->ring.tail;
>
> + xe_gt_assert(gt, !job->ring_ops_force_reset);
> +
> i = emit_copy_timestamp(xe, lrc, dw, i);
>
> i = emit_store_imm_ggtt(saddr, seqno, dw, i);
> diff --git a/drivers/gpu/drm/xe/xe_ring_ops_types.h b/drivers/gpu/drm/xe/xe_ring_ops_types.h
> index d7e3e150a9a5..a42a465ac438 100644
> --- a/drivers/gpu/drm/xe/xe_ring_ops_types.h
> +++ b/drivers/gpu/drm/xe/xe_ring_ops_types.h
> @@ -8,7 +8,7 @@
>
> struct xe_sched_job;
>
> -#define MAX_JOB_SIZE_DW 58
> +#define MAX_JOB_SIZE_DW 72
> #define MAX_JOB_SIZE_BYTES (MAX_JOB_SIZE_DW * 4)
>
> /**
> diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
> index 13c2970e81a8..0490b1247a6e 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job_types.h
> +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
> @@ -63,6 +63,8 @@ struct xe_sched_job {
> u64 sample_timestamp;
> /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */
> bool ring_ops_flush_tlb;
> + /** @ring_ops_force_reset: The ring ops need to trigger a reset before payload. */
> + bool ring_ops_force_reset;
> /** @ggtt: mapped in ggtt. */
> bool ggtt;
> /** @restore_replay: job being replayed for restore */
> --
> 2.47.1
>
next prev parent reply other threads:[~2026-03-23 9:01 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-03 20:13 [PATCH 0/3] drm/xe/vf: Improve getting clean NULL context Michal Wajdeczko
2026-03-03 20:13 ` [PATCH 1/3] drm/xe: Add PR_CTR_CTRL/THRSH register definitions Michal Wajdeczko
2026-03-23 8:51 ` Michał Winiarski
2026-03-03 20:13 ` [PATCH 2/3] drm/xe: Add MI_SEMAPHORE_WAIT command definition Michal Wajdeczko
2026-03-23 8:53 ` Michał Winiarski
2026-03-03 20:13 ` [PATCH 3/3] drm/xe/vf: Improve getting clean NULL context Michal Wajdeczko
2026-03-23 9:01 ` Michał Winiarski [this message]
2026-03-03 20:21 ` ✓ CI.KUnit: success for " Patchwork
2026-03-03 21:04 ` ✗ Xe.CI.BAT: failure " Patchwork
2026-03-04 7:24 ` ✗ Xe.CI.FULL: " Patchwork
2026-03-06 8:58 ` ✓ CI.KUnit: success for drm/xe/vf: Improve getting clean NULL context (rev2) Patchwork
2026-03-06 9:35 ` ✗ Xe.CI.BAT: failure " Patchwork
2026-03-23 9:08 ` Michal Wajdeczko
2026-03-07 9:40 ` ✗ Xe.CI.FULL: " Patchwork
2026-03-23 9:16 ` Michal Wajdeczko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=acEA8JxXmhy4ASpF@nostramo \
--to=michal.winiarski@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=michal.wajdeczko@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox