* [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup
@ 2025-09-29 16:45 Satyanarayana K V P
2025-09-29 16:51 ` ✗ CI.KUnit: failure for drm/xe/migrate: Atomicize CCS copy command setup (rev3) Patchwork
` (2 more replies)
0 siblings, 3 replies; 6+ messages in thread
From: Satyanarayana K V P @ 2025-09-29 16:45 UTC (permalink / raw)
To: intel-xe; +Cc: Satyanarayana K V P, Michal Wajdeczko, Matthew Brost,
Matthew Auld
The CCS copy command is a 5-dword sequence. If the vCPU halts during
save/restore while this sequence is being programmed, partial writes may
trigger page faults when saving IGPU CCS metadata. Use the VMOVDQU
instruction to write the sequence atomically.
Since VMOVDQU operates on 256-bit chunks, update EMIT_COPY_CCS_DW to emit
8 dwords instead of 5 dwords.
Update emit_flush_invalidate() to use VMOVDQU operating with 128-bit
chunks.
Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
---
V2 -> V3:
- Added support for 128 bit and 256 bit instructions with memcpy_vmovdqu
- Updated emit_flush_invalidate() to use vmovdqu instruction.
V1 -> V2:
- Use memcpy_vmovdqu only for x86 arch and for VF. Else use memcpy
(Auld, Matthew)
- Fix issues reported by patchworks.
---
drivers/gpu/drm/xe/xe_migrate.c | 92 +++++++++++++++++++++++++++------
1 file changed, 77 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index 1d667fa36cf3..a37d4cb28aac 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -5,6 +5,7 @@
#include "xe_migrate.h"
+#include <asm/fpu/api.h>
#include <linux/bitfield.h>
#include <linux/sizes.h>
@@ -644,18 +645,64 @@ static void emit_pte(struct xe_migrate *m,
}
}
-#define EMIT_COPY_CCS_DW 5
+/*
+ * The CCS copy command is a 5-dword sequence. If the vCPU halts during
+ * save/restore while this sequence is being issued, partial writes may trigger
+ * page faults when saving iGPU CCS metadata. Use the VMOVDQU instruction to
+ * write the sequence atomically.
+ */
+static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
+{
+ kernel_fpu_begin();
+
+#ifdef CONFIG_X86
+ if (size == SZ_128) {
+ asm("vmovdqu (%0), %%xmm0\n"
+ "vmovups %%xmm0, (%1)\n"
+ :: "r" (src), "r" (dst) : "memory");
+ } else if (size == SZ_256) {
+ asm("vmovdqu (%0), %%ymm0\n"
+ "vmovups %%ymm0, (%1)\n"
+ :: "r" (src), "r" (dst) : "memory");
+ }
+#endif
+ kernel_fpu_end();
+}
+
+static int xe_migrate_memcpy_atomic(struct xe_gt *gt, void *dst, const void
+ *src, u32 size)
+{
+ int instr_size = size * SZ_8;
+
+ if (IS_SRIOV_VF(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX)) {
+ if (instr_size != SZ_128 && instr_size != SZ_256) {
+ drm_dbg(&gt_to_xe(gt)->drm,
+ "Invalid size received for atomic copy %d", instr_size);
+ return -EINVAL;
+ }
+
+ memcpy_vmovdqu(dst, src, instr_size);
+ } else {
+ memcpy(dst, src, size);
+ }
+
+ return 0;
+}
+
+#define EMIT_COPY_CCS_DW 8
static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
u64 dst_ofs, bool dst_is_indirect,
u64 src_ofs, bool src_is_indirect,
u32 size)
{
+ u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
struct xe_device *xe = gt_to_xe(gt);
u32 *cs = bb->cs + bb->len;
u32 num_ccs_blks;
u32 num_pages;
u32 ccs_copy_size;
u32 mocs;
+ u32 i = 0;
if (GRAPHICS_VERx100(xe) >= 2000) {
num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
@@ -673,14 +720,17 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
}
- *cs++ = XY_CTRL_SURF_COPY_BLT |
- (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
- (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
- ccs_copy_size;
- *cs++ = lower_32_bits(src_ofs);
- *cs++ = upper_32_bits(src_ofs) | mocs;
- *cs++ = lower_32_bits(dst_ofs);
- *cs++ = upper_32_bits(dst_ofs) | mocs;
+ dw[i++] = XY_CTRL_SURF_COPY_BLT |
+ (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
+ (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
+ ccs_copy_size;
+ dw[i++] = lower_32_bits(src_ofs);
+ dw[i++] = upper_32_bits(src_ofs) | mocs;
+ dw[i++] = lower_32_bits(dst_ofs);
+ dw[i++] = upper_32_bits(dst_ofs) | mocs;
+
+ if (!xe_migrate_memcpy_atomic(gt, cs, dw, sizeof(u32) * EMIT_COPY_CCS_DW))
+ cs += EMIT_COPY_CCS_DW;
bb->len = cs - bb->cs;
}
@@ -980,16 +1030,28 @@ struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
return migrate->q->lrc[0];
}
+/*
+ * The MI_FLUSH_DW command is a 4-dword sequence. If the vCPU halts during
+ * save/restore while this sequence is being issued, partial writes may
+ * trigger page faults when saving iGPU CCS metadata. Use
+ * xe_migrate_memcpy_atomic() to write the sequence atomically.
+ */
static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i,
u32 flags)
{
struct xe_lrc *lrc = xe_exec_queue_lrc(q);
- dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
- MI_FLUSH_IMM_DW | flags;
- dw[i++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
- MI_FLUSH_DW_USE_GTT;
- dw[i++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
- dw[i++] = MI_NOOP;
+ u32 tmp_dw[SZ_4] = {MI_NOOP}, j = 0;
+
+ tmp_dw[j++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
+ MI_FLUSH_IMM_DW | flags;
+ tmp_dw[j++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
+ MI_FLUSH_DW_USE_GTT;
+ tmp_dw[j++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
+ tmp_dw[j++] = MI_NOOP;
+
+ if (!xe_migrate_memcpy_atomic(q->gt, &dw[i], tmp_dw, sizeof(u32) * j))
+ i += j;
+
dw[i++] = MI_NOOP;
return i;
--
2.43.0
^ permalink raw reply related [flat|nested] 6+ messages in thread

* ✗ CI.KUnit: failure for drm/xe/migrate: Atomicize CCS copy command setup (rev3)
2025-09-29 16:45 [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup Satyanarayana K V P
@ 2025-09-29 16:51 ` Patchwork
2025-10-10 19:12 ` Rodrigo Vivi
2025-09-29 18:54 ` [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup Michal Wajdeczko
2025-10-01 2:50 ` Matthew Brost
2 siblings, 1 reply; 6+ messages in thread
From: Patchwork @ 2025-09-29 16:51 UTC (permalink / raw)
To: Satyanarayana K V P; +Cc: intel-xe
== Series Details ==
Series: drm/xe/migrate: Atomicize CCS copy command setup (rev3)
URL : https://patchwork.freedesktop.org/series/155096/
State : failure
== Summary ==
+ trap cleanup EXIT
+ /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
ERROR:root:../drivers/gpu/drm/xe/xe_migrate.c: In function ‘xe_migrate_memcpy_atomic’:
../drivers/gpu/drm/xe/xe_migrate.c:677:42: error: implicit declaration of function ‘static_cpu_has’; did you mean ‘static_key_false’? [-Werror=implicit-function-declaration]
677 | if (IS_SRIOV_VF(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX)) {
| ^~~~~~~~~~~~~~
| static_key_false
cc1: some warnings being treated as errors
make[7]: *** [../scripts/Makefile.build:287: drivers/gpu/drm/xe/xe_migrate.o] Error 1
make[7]: *** Waiting for unfinished jobs....
make[6]: *** [../scripts/Makefile.build:556: drivers/gpu/drm/xe] Error 2
make[5]: *** [../scripts/Makefile.build:556: drivers/gpu/drm] Error 2
make[4]: *** [../scripts/Makefile.build:556: drivers/gpu] Error 2
make[3]: *** [../scripts/Makefile.build:556: drivers] Error 2
make[2]: *** [/kernel/Makefile:2011: .] Error 2
make[1]: *** [/kernel/Makefile:248: __sub-make] Error 2
make: *** [Makefile:248: __sub-make] Error 2
[16:50:55] Configuring KUnit Kernel ...
Generating .config ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
[16:50:59] Building KUnit Kernel ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
Building with:
$ make all compile_commands.json scripts_gdb ARCH=um O=.kunit --jobs=48
+ cleanup
++ stat -c %u:%g /kernel
+ chown -R 1003:1003 /kernel
^ permalink raw reply [flat|nested] 6+ messages in thread

* Re: ✗ CI.KUnit: failure for drm/xe/migrate: Atomicize CCS copy command setup (rev3)
2025-09-29 16:51 ` ✗ CI.KUnit: failure for drm/xe/migrate: Atomicize CCS copy command setup (rev3) Patchwork
@ 2025-10-10 19:12 ` Rodrigo Vivi
0 siblings, 0 replies; 6+ messages in thread
From: Rodrigo Vivi @ 2025-10-10 19:12 UTC (permalink / raw)
To: intel-xe; +Cc: Satyanarayana K V P
On Mon, Sep 29, 2025 at 04:51:28PM -0000, Patchwork wrote:
> == Series Details ==
>
> Series: drm/xe/migrate: Atomicize CCS copy command setup (rev3)
> URL : https://patchwork.freedesktop.org/series/155096/
> State : failure
>
> == Summary ==
>
> + trap cleanup EXIT
> + /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
> ERROR:root:../drivers/gpu/drm/xe/xe_migrate.c: In function ‘xe_migrate_memcpy_atomic’:
> ../drivers/gpu/drm/xe/xe_migrate.c:677:42: error: implicit declaration of function ‘static_cpu_has’; did you mean ‘static_key_false’? [-Werror=implicit-function-declaration]
> 677 | if (IS_SRIOV_VF(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX)) {
> | ^~~~~~~~~~~~~~
> | static_key_false
> cc1: some warnings being treated as errors
please ensure this is passing....
> make[7]: *** [../scripts/Makefile.build:287: drivers/gpu/drm/xe/xe_migrate.o] Error 1
> make[7]: *** Waiting for unfinished jobs....
> make[6]: *** [../scripts/Makefile.build:556: drivers/gpu/drm/xe] Error 2
> make[5]: *** [../scripts/Makefile.build:556: drivers/gpu/drm] Error 2
> make[4]: *** [../scripts/Makefile.build:556: drivers/gpu] Error 2
> make[3]: *** [../scripts/Makefile.build:556: drivers] Error 2
> make[2]: *** [/kernel/Makefile:2011: .] Error 2
> make[1]: *** [/kernel/Makefile:248: __sub-make] Error 2
> make: *** [Makefile:248: __sub-make] Error 2
>
> [16:50:55] Configuring KUnit Kernel ...
> Generating .config ...
> Populating config with:
> $ make ARCH=um O=.kunit olddefconfig
> [16:50:59] Building KUnit Kernel ...
> Populating config with:
> $ make ARCH=um O=.kunit olddefconfig
> Building with:
> $ make all compile_commands.json scripts_gdb ARCH=um O=.kunit --jobs=48
> + cleanup
> ++ stat -c %u:%g /kernel
> + chown -R 1003:1003 /kernel
>
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup
2025-09-29 16:45 [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup Satyanarayana K V P
2025-09-29 16:51 ` ✗ CI.KUnit: failure for drm/xe/migrate: Atomicize CCS copy command setup (rev3) Patchwork
@ 2025-09-29 18:54 ` Michal Wajdeczko
2025-10-01 2:31 ` Matthew Brost
2025-10-01 2:50 ` Matthew Brost
2 siblings, 1 reply; 6+ messages in thread
From: Michal Wajdeczko @ 2025-09-29 18:54 UTC (permalink / raw)
To: Satyanarayana K V P, intel-xe; +Cc: Matthew Brost, Matthew Auld
On 9/29/2025 6:45 PM, Satyanarayana K V P wrote:
> The CCS copy command is a 5-dword sequence. If the vCPU halts during
> save/restore while this sequence is being programmed, partial writes may
> trigger page faults when saving IGPU CCS metadata. Use the VMOVDQU
> instruction to write the sequence atomically.
>
> Since VMOVDQU operates on 256-bit chunks, update EMIT_COPY_CCS_DW to emit
> 8 dwords instead of 5 dwords.
>
> Update emit_flush_invalidate() to use VMOVDQU operating with 128-bit
> chunks.
>
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Matthew Auld <matthew.auld@intel.com>
>
> ---
> V2 -> V3:
> - Added support for 128 bit and 256 bit instructions with memcpy_vmovdqu
> - Updated emit_flush_invalidate() to use vmovdqu instruction.
>
> V1 -> V2:
> - Use memcpy_vmovdqu only for x86 arch and for VF. Else use memcpy
> (Auld, Matthew)
> - Fix issues reported by patchworks.
> ---
> drivers/gpu/drm/xe/xe_migrate.c | 92 +++++++++++++++++++++++++++------
> 1 file changed, 77 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 1d667fa36cf3..a37d4cb28aac 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -5,6 +5,7 @@
>
> #include "xe_migrate.h"
>
> +#include <asm/fpu/api.h>
> #include <linux/bitfield.h>
> #include <linux/sizes.h>
>
> @@ -644,18 +645,64 @@ static void emit_pte(struct xe_migrate *m,
> }
> }
>
> -#define EMIT_COPY_CCS_DW 5
> +/*
> + * The CCS copy command is a 5-dword sequence. If the vCPU halts during
> + * save/restore while this sequence is being issued, partial writes may trigger
> + * page faults when saving iGPU CCS metadata. Use the VMOVDQU instruction to
> + * write the sequence atomically.
shouldn't this comment be near/inside emit_copy_ccs() ?
> + */
> +static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
> +{
> + kernel_fpu_begin();
> +
> +#ifdef CONFIG_X86
> + if (size == SZ_128) {
> + asm("vmovdqu (%0), %%xmm0\n"
> + "vmovups %%xmm0, (%1)\n"
> + :: "r" (src), "r" (dst) : "memory");
> + } else if (size == SZ_256) {
> + asm("vmovdqu (%0), %%ymm0\n"
> + "vmovups %%ymm0, (%1)\n"
> + :: "r" (src), "r" (dst) : "memory");
> + }
> +#endif
> + kernel_fpu_end();
> +}
> +
> +static int xe_migrate_memcpy_atomic(struct xe_gt *gt, void *dst, const void
maybe name it as emit_atomic() ?
> + *src, u32 size)
> +{
> + int instr_size = size * SZ_8;
why signed int?
and maybe explain where this 8 comes from?
did you mean BITS_PER_BYTE here
> +
> + if (IS_SRIOV_VF(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX)) {
> + if (instr_size != SZ_128 && instr_size != SZ_256) {
> > + drm_dbg(&gt_to_xe(gt)->drm,
> + "Invalid size received for atomic copy %d", instr_size);
as this is an internal/static function, there should be no need for runtime checks - this is all your code
assert shall be sufficient
> + return -EINVAL;
> + }
> +
> + memcpy_vmovdqu(dst, src, instr_size);
> + } else {
> + memcpy(dst, src, size);
> + }
> +
> + return 0;
> +}
> +
> +#define EMIT_COPY_CCS_DW 8
> static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
> u64 dst_ofs, bool dst_is_indirect,
> u64 src_ofs, bool src_is_indirect,
> u32 size)
> {
> + u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
> struct xe_device *xe = gt_to_xe(gt);
> u32 *cs = bb->cs + bb->len;
> u32 num_ccs_blks;
> u32 num_pages;
> u32 ccs_copy_size;
> u32 mocs;
> + u32 i = 0;
>
> if (GRAPHICS_VERx100(xe) >= 2000) {
> num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
> @@ -673,14 +720,17 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
> mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
> }
>
> - *cs++ = XY_CTRL_SURF_COPY_BLT |
> - (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
> - (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
> - ccs_copy_size;
> - *cs++ = lower_32_bits(src_ofs);
> - *cs++ = upper_32_bits(src_ofs) | mocs;
> - *cs++ = lower_32_bits(dst_ofs);
> - *cs++ = upper_32_bits(dst_ofs) | mocs;
> + dw[i++] = XY_CTRL_SURF_COPY_BLT |
> + (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
> + (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
> + ccs_copy_size;
> + dw[i++] = lower_32_bits(src_ofs);
> + dw[i++] = upper_32_bits(src_ofs) | mocs;
> + dw[i++] = lower_32_bits(dst_ofs);
> + dw[i++] = upper_32_bits(dst_ofs) | mocs;
> +
> + if (!xe_migrate_memcpy_atomic(gt, cs, dw, sizeof(u32) * EMIT_COPY_CCS_DW))
> + cs += EMIT_COPY_CCS_DW;
>
> bb->len = cs - bb->cs;
> }
> @@ -980,16 +1030,28 @@ struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
> return migrate->q->lrc[0];
> }
>
> +/*
> + * The MI_FLUSH_DW command is a 4-dword sequence. If the vCPU halts during
> + * save/restore while this sequence is being issued, partial writes may
> + * trigger page faults when saving iGPU CCS metadata. Use
> + * xe_migrate_memcpy_atomic() to write the sequence atomically.
> + */
> static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i,
> u32 flags)
> {
> struct xe_lrc *lrc = xe_exec_queue_lrc(q);
> - dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> - MI_FLUSH_IMM_DW | flags;
> - dw[i++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
> - MI_FLUSH_DW_USE_GTT;
> - dw[i++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
> - dw[i++] = MI_NOOP;
> + u32 tmp_dw[SZ_4] = {MI_NOOP}, j = 0;
> +
> + tmp_dw[j++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> + MI_FLUSH_IMM_DW | flags;
> + tmp_dw[j++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
> + MI_FLUSH_DW_USE_GTT;
> + tmp_dw[j++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
> + tmp_dw[j++] = MI_NOOP;
> +
> + if (!xe_migrate_memcpy_atomic(q->gt, &dw[i], tmp_dw, sizeof(u32) * j))
j must always be 4, correct? maybe use sizeof(tmp) ?
> + i += j;
> +
> dw[i++] = MI_NOOP;
why this extra noop?
>
> return i;
^ permalink raw reply [flat|nested] 6+ messages in thread

* Re: [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup
2025-09-29 18:54 ` [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup Michal Wajdeczko
@ 2025-10-01 2:31 ` Matthew Brost
0 siblings, 0 replies; 6+ messages in thread
From: Matthew Brost @ 2025-10-01 2:31 UTC (permalink / raw)
To: Michal Wajdeczko; +Cc: Satyanarayana K V P, intel-xe, Matthew Auld
On Mon, Sep 29, 2025 at 08:54:39PM +0200, Michal Wajdeczko wrote:
>
>
> On 9/29/2025 6:45 PM, Satyanarayana K V P wrote:
> > The CCS copy command is a 5-dword sequence. If the vCPU halts during
> > save/restore while this sequence is being programmed, partial writes may
> > trigger page faults when saving IGPU CCS metadata. Use the VMOVDQU
> > instruction to write the sequence atomically.
> >
> > Since VMOVDQU operates on 256-bit chunks, update EMIT_COPY_CCS_DW to emit
> > 8 dwords instead of 5 dwords.
> >
> > Update emit_flush_invalidate() to use VMOVDQU operating with 128-bit
> > chunks.
> >
> > Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
> > Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> > Cc: Matthew Brost <matthew.brost@intel.com>
> > Cc: Matthew Auld <matthew.auld@intel.com>
> >
> > ---
> > V2 -> V3:
> > - Added support for 128 bit and 256 bit instructions with memcpy_vmovdqu
> > - Updated emit_flush_invalidate() to use vmovdqu instruction.
> >
> > V1 -> V2:
> > - Use memcpy_vmovdqu only for x86 arch and for VF. Else use memcpy
> > (Auld, Matthew)
> > - Fix issues reported by patchworks.
> > ---
> > drivers/gpu/drm/xe/xe_migrate.c | 92 +++++++++++++++++++++++++++------
> > 1 file changed, 77 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> > index 1d667fa36cf3..a37d4cb28aac 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.c
> > +++ b/drivers/gpu/drm/xe/xe_migrate.c
> > @@ -5,6 +5,7 @@
> >
> > #include "xe_migrate.h"
> >
> > +#include <asm/fpu/api.h>
> > #include <linux/bitfield.h>
> > #include <linux/sizes.h>
> >
> > @@ -644,18 +645,64 @@ static void emit_pte(struct xe_migrate *m,
> > }
> > }
> >
> > -#define EMIT_COPY_CCS_DW 5
> > +/*
> > + * The CCS copy command is a 5-dword sequence. If the vCPU halts during
> > + * save/restore while this sequence is being issued, partial writes may trigger
> > + * page faults when saving iGPU CCS metadata. Use the VMOVDQU instruction to
> > + * write the sequence atomically.
>
> shouldn't this comment be near/inside emit_copy_ccs() ?
>
> > + */
> > +static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
> > +{
> > + kernel_fpu_begin();
> > +
> > +#ifdef CONFIG_X86
> > + if (size == SZ_128) {
> > + asm("vmovdqu (%0), %%xmm0\n"
> > + "vmovups %%xmm0, (%1)\n"
> > + :: "r" (src), "r" (dst) : "memory");
> > + } else if (size == SZ_256) {
> > + asm("vmovdqu (%0), %%ymm0\n"
> > + "vmovups %%ymm0, (%1)\n"
> > + :: "r" (src), "r" (dst) : "memory");
> > + }
> > +#endif
> > + kernel_fpu_end();
> > +}
> > +
> > +static int xe_migrate_memcpy_atomic(struct xe_gt *gt, void *dst, const void
>
> maybe name it as emit_atomic() ?
>
> > + *src, u32 size)
> > +{
> > + int instr_size = size * SZ_8;
>
> why signed int?
> and maybe explain where this 8 comes from?
> did you mean BITS_PER_BYTE here
>
> > +
> > + if (IS_SRIOV_VF(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX)) {
> > + if (instr_size != SZ_128 && instr_size != SZ_256) {
> > > + drm_dbg(&gt_to_xe(gt)->drm,
> > + "Invalid size received for atomic copy %d", instr_size);
>
> as this is an internal/static function, there should be no need for runtime checks - this is all your code
> assert shall be sufficient
I agree here. In general, internal functions to a file don't need to
check arguments, or if they do, asserts are the preferred way to do this.
Even for exported functions within the driver, IMO asserts are fine. It is
when you expose a function to user space (IOCTLs) or export module-level
functions that argument sanitization becomes really important.
> > + return -EINVAL;
> > + }
> > +
> > + memcpy_vmovdqu(dst, src, instr_size);
> > + } else {
> > + memcpy(dst, src, size);
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +#define EMIT_COPY_CCS_DW 8
> > static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
> > u64 dst_ofs, bool dst_is_indirect,
> > u64 src_ofs, bool src_is_indirect,
> > u32 size)
> > {
> > + u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
> > struct xe_device *xe = gt_to_xe(gt);
> > u32 *cs = bb->cs + bb->len;
> > u32 num_ccs_blks;
> > u32 num_pages;
> > u32 ccs_copy_size;
> > u32 mocs;
> > + u32 i = 0;
> >
> > if (GRAPHICS_VERx100(xe) >= 2000) {
> > num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
> > @@ -673,14 +720,17 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
> > mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
> > }
> >
> > - *cs++ = XY_CTRL_SURF_COPY_BLT |
> > - (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
> > - (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
> > - ccs_copy_size;
> > - *cs++ = lower_32_bits(src_ofs);
> > - *cs++ = upper_32_bits(src_ofs) | mocs;
> > - *cs++ = lower_32_bits(dst_ofs);
> > - *cs++ = upper_32_bits(dst_ofs) | mocs;
> > + dw[i++] = XY_CTRL_SURF_COPY_BLT |
> > + (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
> > + (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
> > + ccs_copy_size;
> > + dw[i++] = lower_32_bits(src_ofs);
> > + dw[i++] = upper_32_bits(src_ofs) | mocs;
> > + dw[i++] = lower_32_bits(dst_ofs);
> > + dw[i++] = upper_32_bits(dst_ofs) | mocs;
> > +
> > + if (!xe_migrate_memcpy_atomic(gt, cs, dw, sizeof(u32) * EMIT_COPY_CCS_DW))
> > + cs += EMIT_COPY_CCS_DW;
> >
> > bb->len = cs - bb->cs;
> > }
> > @@ -980,16 +1030,28 @@ struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
> > return migrate->q->lrc[0];
> > }
> >
> > +/*
> > + * The MI_FLUSH_DW command is a 4-dword sequence. If the vCPU halts during
> > + * save/restore while this sequence is being issued, partial writes may
> > + * trigger page faults when saving iGPU CCS metadata. Use
> > + * xe_migrate_memcpy_atomic() to write the sequence atomically.
> > + */
> > static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i,
> > u32 flags)
> > {
> > struct xe_lrc *lrc = xe_exec_queue_lrc(q);
> > - dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> > - MI_FLUSH_IMM_DW | flags;
> > - dw[i++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
> > - MI_FLUSH_DW_USE_GTT;
> > - dw[i++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
> > - dw[i++] = MI_NOOP;
> > + u32 tmp_dw[SZ_4] = {MI_NOOP}, j = 0;
> > +
> > + tmp_dw[j++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> > + MI_FLUSH_IMM_DW | flags;
> > + tmp_dw[j++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
> > + MI_FLUSH_DW_USE_GTT;
> > + tmp_dw[j++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
> > + tmp_dw[j++] = MI_NOOP;
> > +
> > + if (!xe_migrate_memcpy_atomic(q->gt, &dw[i], tmp_dw, sizeof(u32) * j))
>
> j must always be 4, correct? maybe use sizeof(tmp) ?
>
> > + i += j;
> > +
> > dw[i++] = MI_NOOP;
>
> why this extra noop?
This is existing code but probably could be deleted.
Matt
> >
> > return i;
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup
2025-09-29 16:45 [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup Satyanarayana K V P
2025-09-29 16:51 ` ✗ CI.KUnit: failure for drm/xe/migrate: Atomicize CCS copy command setup (rev3) Patchwork
2025-09-29 18:54 ` [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup Michal Wajdeczko
@ 2025-10-01 2:50 ` Matthew Brost
2 siblings, 0 replies; 6+ messages in thread
From: Matthew Brost @ 2025-10-01 2:50 UTC (permalink / raw)
To: Satyanarayana K V P; +Cc: intel-xe, Michal Wajdeczko, Matthew Auld
On Mon, Sep 29, 2025 at 04:45:07PM +0000, Satyanarayana K V P wrote:
> The CCS copy command is a 5-dword sequence. If the vCPU halts during
> save/restore while this sequence is being programmed, partial writes may
> trigger page faults when saving IGPU CCS metadata. Use the VMOVDQU
> instruction to write the sequence atomically.
>
> Since VMOVDQU operates on 256-bit chunks, update EMIT_COPY_CCS_DW to emit
> 8 dwords instead of 5 dwords.
>
> Update emit_flush_invalidate() to use VMOVDQU operating with 128-bit
> chunks.
>
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Matthew Auld <matthew.auld@intel.com>
>
For this series and the subsequent follow-up that addresses the inverse
side of the problem (i.e., avoiding memset calls on BB detach), I
believe we really need an IGT that focuses solely on creating and
destroying many CCS BOs in a loop, using threads to attempt to trigger
the race condition. With enough iterations, we should be able to get the
save/restore BB to hang on PTL.
Once the patches are applied, the test should pass reliably. In my
opinion, that’s the only way to truly validate the correctness of the
fix—it’s nearly impossible to reason about correctness without such
testing. So this is where we need to start: testing. This is almost
always the case with complex issues.
With that in mind, can we get this IGT implemented and added to VMTB?
Matt
> ---
> V2 -> V3:
> - Added support for 128 bit and 256 bit instructions with memcpy_vmovdqu
> - Updated emit_flush_invalidate() to use vmovdqu instruction.
>
> V1 -> V2:
> - Use memcpy_vmovdqu only for x86 arch and for VF. Else use memcpy
> (Auld, Matthew)
> - Fix issues reported by patchworks.
> ---
> drivers/gpu/drm/xe/xe_migrate.c | 92 +++++++++++++++++++++++++++------
> 1 file changed, 77 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 1d667fa36cf3..a37d4cb28aac 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -5,6 +5,7 @@
>
> #include "xe_migrate.h"
>
> +#include <asm/fpu/api.h>
> #include <linux/bitfield.h>
> #include <linux/sizes.h>
>
> @@ -644,18 +645,64 @@ static void emit_pte(struct xe_migrate *m,
> }
> }
>
> -#define EMIT_COPY_CCS_DW 5
> +/*
> + * The CCS copy command is a 5-dword sequence. If the vCPU halts during
> + * save/restore while this sequence is being issued, partial writes may trigger
> + * page faults when saving iGPU CCS metadata. Use the VMOVDQU instruction to
> + * write the sequence atomically.
> + */
> +static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
> +{
> + kernel_fpu_begin();
> +
> +#ifdef CONFIG_X86
> + if (size == SZ_128) {
> + asm("vmovdqu (%0), %%xmm0\n"
> + "vmovups %%xmm0, (%1)\n"
> + :: "r" (src), "r" (dst) : "memory");
> + } else if (size == SZ_256) {
> + asm("vmovdqu (%0), %%ymm0\n"
> + "vmovups %%ymm0, (%1)\n"
> + :: "r" (src), "r" (dst) : "memory");
> + }
> +#endif
> + kernel_fpu_end();
> +}
> +
> +static int xe_migrate_memcpy_atomic(struct xe_gt *gt, void *dst, const void
> + *src, u32 size)
> +{
> + int instr_size = size * SZ_8;
> +
> + if (IS_SRIOV_VF(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX)) {
> + if (instr_size != SZ_128 && instr_size != SZ_256) {
> > + drm_dbg(&gt_to_xe(gt)->drm,
> + "Invalid size received for atomic copy %d", instr_size);
> + return -EINVAL;
> + }
> +
> + memcpy_vmovdqu(dst, src, instr_size);
> + } else {
> + memcpy(dst, src, size);
> + }
> +
> + return 0;
> +}
> +
> +#define EMIT_COPY_CCS_DW 8
> static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
> u64 dst_ofs, bool dst_is_indirect,
> u64 src_ofs, bool src_is_indirect,
> u32 size)
> {
> + u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
> struct xe_device *xe = gt_to_xe(gt);
> u32 *cs = bb->cs + bb->len;
> u32 num_ccs_blks;
> u32 num_pages;
> u32 ccs_copy_size;
> u32 mocs;
> + u32 i = 0;
>
> if (GRAPHICS_VERx100(xe) >= 2000) {
> num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
> @@ -673,14 +720,17 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
> mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
> }
>
> - *cs++ = XY_CTRL_SURF_COPY_BLT |
> - (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
> - (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
> - ccs_copy_size;
> - *cs++ = lower_32_bits(src_ofs);
> - *cs++ = upper_32_bits(src_ofs) | mocs;
> - *cs++ = lower_32_bits(dst_ofs);
> - *cs++ = upper_32_bits(dst_ofs) | mocs;
> + dw[i++] = XY_CTRL_SURF_COPY_BLT |
> + (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
> + (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
> + ccs_copy_size;
> + dw[i++] = lower_32_bits(src_ofs);
> + dw[i++] = upper_32_bits(src_ofs) | mocs;
> + dw[i++] = lower_32_bits(dst_ofs);
> + dw[i++] = upper_32_bits(dst_ofs) | mocs;
> +
> + if (!xe_migrate_memcpy_atomic(gt, cs, dw, sizeof(u32) * EMIT_COPY_CCS_DW))
> + cs += EMIT_COPY_CCS_DW;
>
> bb->len = cs - bb->cs;
> }
> @@ -980,16 +1030,28 @@ struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
> return migrate->q->lrc[0];
> }
>
> +/*
> + * The MI_FLUSH_DW command is a 4-dword sequence. If the vCPU halts during
> + * save/restore while this sequence is being issued, partial writes may
> + * trigger page faults when saving iGPU CCS metadata. Use
> + * xe_migrate_memcpy_atomic() to write the sequence atomically.
> + */
> static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i,
> u32 flags)
> {
> struct xe_lrc *lrc = xe_exec_queue_lrc(q);
> - dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> - MI_FLUSH_IMM_DW | flags;
> - dw[i++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
> - MI_FLUSH_DW_USE_GTT;
> - dw[i++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
> - dw[i++] = MI_NOOP;
> + u32 tmp_dw[SZ_4] = {MI_NOOP}, j = 0;
> +
> + tmp_dw[j++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> + MI_FLUSH_IMM_DW | flags;
> + tmp_dw[j++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
> + MI_FLUSH_DW_USE_GTT;
> + tmp_dw[j++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
> + tmp_dw[j++] = MI_NOOP;
> +
> + if (!xe_migrate_memcpy_atomic(q->gt, &dw[i], tmp_dw, sizeof(u32) * j))
> + i += j;
> +
> dw[i++] = MI_NOOP;
>
> return i;
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2025-10-10 19:12 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-09-29 16:45 [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup Satyanarayana K V P
2025-09-29 16:51 ` ✗ CI.KUnit: failure for drm/xe/migrate: Atomicize CCS copy command setup (rev3) Patchwork
2025-10-10 19:12 ` Rodrigo Vivi
2025-09-29 18:54 ` [PATCH v3] drm/xe/migrate: Atomicize CCS copy command setup Michal Wajdeczko
2025-10-01 2:31 ` Matthew Brost
2025-10-01 2:50 ` Matthew Brost
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox