Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Matthew Brost <matthew.brost@intel.com>
To: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: <intel-xe@lists.freedesktop.org>,
	Michal Wajdeczko <michal.wajdeczko@intel.com>,
	Matthew Auld <matthew.auld@intel.com>
Subject: Re: [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup
Date: Tue, 30 Sep 2025 19:50:17 -0700	[thread overview]
Message-ID: <aNyW6R8KbcMR4HNB@lstrano-desk.jf.intel.com> (raw)
In-Reply-To: <20250929164507.2593639-1-satyanarayana.k.v.p@intel.com>

On Mon, Sep 29, 2025 at 04:45:07PM +0000, Satyanarayana K V P wrote:
> The CCS copy command is a 5-dword sequence. If the vCPU halts during
> save/restore while this sequence is being programmed, partial writes may
> trigger page faults when saving iGPU CCS metadata. Use the VMOVDQU
> instruction to write the sequence atomically.
> 
> Since VMOVDQU operates on 256-bit chunks, update EMIT_COPY_CCS_DW to emit
> 8 dwords instead of 5 dwords.
> 
> Update emit_flush_invalidate() to use VMOVDQU operating on 128-bit
> chunks.
> 
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Matthew Auld <matthew.auld@intel.com>
> 

For this series and the subsequent follow-up that addresses the inverse
side of the problem (i.e., avoiding memset calls on BB detach), I
believe we really need an IGT that focuses solely on creating and
destroying many CCS BOs in a loop, using threads to attempt to trigger
the race condition. With enough iterations, we should be able to get the
save/restore BB to hang on PTL.

Once the patches are applied, the test should pass reliably. In my
opinion, that’s the only way to truly validate the correctness of the
fix—it’s nearly impossible to reason about correctness without such
testing. So this is where we need to start: testing. This is almost
always the case with complex issues.

With that in mind, can we get this IGT implemented and added to VMTB?

Matt 

> ---
> V2 -> V3:
> - Added support for 128 bit and 256 bit instructions with memcpy_vmovdqu
> - Updated emit_flush_invalidate() to use vmovdqu instruction.
> 
> V1 -> V2:
> - Use memcpy_vmovdqu only for x86 arch and for VF. Else use memcpy
>   (Auld, Matthew)
> - Fix issues reported by Patchwork.
> ---
>  drivers/gpu/drm/xe/xe_migrate.c | 92 +++++++++++++++++++++++++++------
>  1 file changed, 77 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 1d667fa36cf3..a37d4cb28aac 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -5,6 +5,7 @@
>  
>  #include "xe_migrate.h"
>  
> +#include <asm/fpu/api.h>
>  #include <linux/bitfield.h>
>  #include <linux/sizes.h>
>  
> @@ -644,18 +645,64 @@ static void emit_pte(struct xe_migrate *m,
>  	}
>  }
>  
> -#define EMIT_COPY_CCS_DW 5
> +/*
> + * The CCS copy command is a 5-dword sequence. If the vCPU halts during
> + * save/restore while this sequence is being issued, partial writes may trigger
> + * page faults when saving iGPU CCS metadata. Use the VMOVDQU instruction to
> + * write the sequence atomically.
> + */
> +static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
> +{
> +	kernel_fpu_begin();
> +
> +#ifdef CONFIG_X86
> +	if (size == SZ_128) {
> +		asm("vmovdqu (%0), %%xmm0\n"
> +		    "vmovups %%xmm0,   (%1)\n"
> +		    :: "r" (src), "r" (dst) : "memory");
> +	} else if (size == SZ_256) {
> +		asm("vmovdqu (%0), %%ymm0\n"
> +		    "vmovups %%ymm0,   (%1)\n"
> +		    :: "r" (src), "r" (dst) : "memory");
> +	}
> +#endif
> +	kernel_fpu_end();
> +}
> +
> +static int xe_migrate_memcpy_atomic(struct xe_gt *gt, void *dst, const void
> +				     *src, u32 size)
> +{
> +	int instr_size = size * SZ_8;
> +
> +	if (IS_SRIOV_VF(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX)) {
> +		if (instr_size != SZ_128 && instr_size != SZ_256) {
> +			drm_dbg(&gt_to_xe(gt)->drm,
> +				"Invalid size received for atomic copy %d", instr_size);
> +			return -EINVAL;
> +		}
> +
> +		memcpy_vmovdqu(dst, src, instr_size);
> +	} else {
> +		memcpy(dst, src, size);
> +	}
> +
> +	return 0;
> +}
> +
> +#define EMIT_COPY_CCS_DW 8
>  static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
>  			  u64 dst_ofs, bool dst_is_indirect,
>  			  u64 src_ofs, bool src_is_indirect,
>  			  u32 size)
>  {
> +	u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
>  	struct xe_device *xe = gt_to_xe(gt);
>  	u32 *cs = bb->cs + bb->len;
>  	u32 num_ccs_blks;
>  	u32 num_pages;
>  	u32 ccs_copy_size;
>  	u32 mocs;
> +	u32 i = 0;
>  
>  	if (GRAPHICS_VERx100(xe) >= 2000) {
>  		num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
> @@ -673,14 +720,17 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
>  		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
>  	}
>  
> -	*cs++ = XY_CTRL_SURF_COPY_BLT |
> -		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
> -		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
> -		ccs_copy_size;
> -	*cs++ = lower_32_bits(src_ofs);
> -	*cs++ = upper_32_bits(src_ofs) | mocs;
> -	*cs++ = lower_32_bits(dst_ofs);
> -	*cs++ = upper_32_bits(dst_ofs) | mocs;
> +	dw[i++] = XY_CTRL_SURF_COPY_BLT |
> +		  (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
> +		  (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
> +		  ccs_copy_size;
> +	dw[i++] = lower_32_bits(src_ofs);
> +	dw[i++] = upper_32_bits(src_ofs) | mocs;
> +	dw[i++] = lower_32_bits(dst_ofs);
> +	dw[i++] = upper_32_bits(dst_ofs) | mocs;
> +
> +	if (!xe_migrate_memcpy_atomic(gt, cs, dw, sizeof(u32) * EMIT_COPY_CCS_DW))
> +		cs += EMIT_COPY_CCS_DW;
>  
>  	bb->len = cs - bb->cs;
>  }
> @@ -980,16 +1030,28 @@ struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
>  	return migrate->q->lrc[0];
>  }
>  
> +/*
> + * The MI_FLUSH_DW command is a 4-dword sequence. If the vCPU halts during
> + * save/restore while this sequence is being issued, partial writes may
> + * trigger page faults when saving iGPU CCS metadata. Use
> + * xe_migrate_memcpy_atomic() to write the sequence atomically.
> + */
>  static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i,
>  				 u32 flags)
>  {
>  	struct xe_lrc *lrc = xe_exec_queue_lrc(q);
> -	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> -		  MI_FLUSH_IMM_DW | flags;
> -	dw[i++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
> -		  MI_FLUSH_DW_USE_GTT;
> -	dw[i++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
> -	dw[i++] = MI_NOOP;
> +	u32 tmp_dw[SZ_4] = {MI_NOOP}, j = 0;
> +
> +	tmp_dw[j++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> +		      MI_FLUSH_IMM_DW | flags;
> +	tmp_dw[j++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
> +		      MI_FLUSH_DW_USE_GTT;
> +	tmp_dw[j++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
> +	tmp_dw[j++] = MI_NOOP;
> +
> +	if (!xe_migrate_memcpy_atomic(q->gt, &dw[i], tmp_dw, sizeof(u32) * j))
> +		i += j;
> +
>  	dw[i++] = MI_NOOP;
>  
>  	return i;
> -- 
> 2.43.0
> 

      parent reply	other threads:[~2025-10-01  2:50 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-09-29 16:45 [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup Satyanarayana K V P
2025-09-29 16:51 ` ✗ CI.KUnit: failure for drm/xe/migrate: Atomicize CCS copy command setup (rev3) Patchwork
2025-10-10 19:12   ` Rodrigo Vivi
2025-09-29 18:54 ` [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup Michal Wajdeczko
2025-10-01  2:31   ` Matthew Brost
2025-10-01  2:50 ` Matthew Brost [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aNyW6R8KbcMR4HNB@lstrano-desk.jf.intel.com \
    --to=matthew.brost@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.auld@intel.com \
    --cc=michal.wajdeczko@intel.com \
    --cc=satyanarayana.k.v.p@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox