Intel-XE Archive on lore.kernel.org
* [PATCH v4 0/3] drm/xe/migrate: Atomicize CCS copy command setup
@ 2025-10-06 15:24 Satyanarayana K V P
  2025-10-06 15:24 ` [PATCH v4 1/3] " Satyanarayana K V P
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Satyanarayana K V P @ 2025-10-06 15:24 UTC (permalink / raw)
  To: intel-xe; +Cc: Satyanarayana K V P

The CCS copy command is a 5-dword sequence. If the vCPU halts during
save/restore while this sequence is being programmed, partial writes may
trigger page faults when saving iGPU CCS metadata. Use the VMOVDQU
instruction to write the sequence atomically. Since the 256-bit form of
VMOVDQU is used here, update EMIT_COPY_CCS_DW to emit 8 dwords instead
of 5. Update emit_flush_invalidate() to use VMOVDQU with 128-bit chunks.
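
For illustration, the write becomes a single 256-bit store instead of
five separate dword stores; a rough sketch (the actual helpers are in
patch 1, "cs" being the batch buffer cursor):

	u32 dw[8] = {};	/* 5 command dwords padded to 8 (32 bytes) */

	/* ... fill dw[0..4] with the CCS copy command ... */

	kernel_fpu_begin();
	asm("vmovdqu (%0), %%ymm0\n"
	    "vmovups %%ymm0, (%1)\n"
	    :: "r" (dw), "r" (cs) : "memory");
	kernel_fpu_end();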

The MI_STORE_DATA_IMM instruction header is a quadword (two dwords) in
size. If the vCPU halts during save/restore while the header is being
programmed, partial writes may trigger page faults when saving iGPU CCS
metadata. Update the instruction header atomically.
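
In practice (see patch 2) this means emitting both header dwords with a
single 64-bit store, roughly:

	dw[0] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
	dw[1] = ofs;
	WRITE_ONCE(*(u64 *)&bb->cs[bb->len], *(u64 *)dw);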

Clear the contents of the CCS read/write batch buffer, ensuring no page
faults or GPU hangs occur if migration happens midway.

---
V3 -> V4:
- Fixed review comments. (Wajdeczko)
- Fix issues reported by Patchwork.

V2 -> V3:
- Added support for 128-bit and 256-bit instructions with memcpy_vmovdqu
- Updated emit_flush_invalidate() to use the vmovdqu instruction.

V1 -> V2:
- Use memcpy_vmovdqu only for x86 arch and for VF. Else use memcpy
(Auld, Matthew)
- Fix issues reported by Patchwork.

Satyanarayana K V P (3):
  drm/xe/migrate: Atomicize CCS copy command setup
  drm/xe/migrate: Make emit_pte() header write atomic
  drm/xe/vf: Clear CCS read/write buffers in atomic way

 drivers/gpu/drm/xe/xe_migrate.c      | 231 ++++++++++++++++++++++++---
 drivers/gpu/drm/xe/xe_migrate.h      |   3 +
 drivers/gpu/drm/xe/xe_sriov_vf_ccs.c |   5 +-
 3 files changed, 215 insertions(+), 24 deletions(-)

-- 
2.51.0



* [PATCH v4 1/3] drm/xe/migrate: Atomicize CCS copy command setup
  2025-10-06 15:24 [PATCH v4 0/3] drm/xe/migrate: Atomicize CCS copy command setup Satyanarayana K V P
@ 2025-10-06 15:24 ` Satyanarayana K V P
  2025-10-06 19:58   ` Matthew Brost
  2025-10-06 15:24 ` [PATCH v4 2/3] drm/xe/migrate: Make emit_pte() header write atomic Satyanarayana K V P
  2025-10-06 15:24 ` [PATCH v4 3/3] drm/xe/vf: Clear CCS read/write buffers in atomic way Satyanarayana K V P
  2 siblings, 1 reply; 9+ messages in thread
From: Satyanarayana K V P @ 2025-10-06 15:24 UTC (permalink / raw)
  To: intel-xe; +Cc: Satyanarayana K V P, Michal Wajdeczko, Matthew Brost,
	Matthew Auld

The CCS copy command is a 5-dword sequence. If the vCPU halts during
save/restore while this sequence is being programmed, partial writes may
trigger page faults when saving iGPU CCS metadata. Use the VMOVDQU
instruction to write the sequence atomically.

Since the 256-bit form of VMOVDQU is used here, update EMIT_COPY_CCS_DW
to emit 8 dwords instead of 5.

Update emit_flush_invalidate() to use VMOVDQU operating with 128-bit
chunks.

Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>

---
V3 -> V4:
- Fixed review comments. (Wajdeczko)
- Fix issues reported by Patchwork.

V2 -> V3:
- Added support for 128-bit and 256-bit instructions with memcpy_vmovdqu
- Updated emit_flush_invalidate() to use the vmovdqu instruction.

V1 -> V2:
- Use memcpy_vmovdqu only for x86 arch and for VF. Else use memcpy
  (Auld, Matthew)
- Fix issues reported by Patchwork.
---
 drivers/gpu/drm/xe/xe_migrate.c | 92 +++++++++++++++++++++++++--------
 1 file changed, 71 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index c39c3b423d05..b960fdcecd88 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -5,7 +5,9 @@
 
 #include "xe_migrate.h"
 
+#include <asm/fpu/api.h>
 #include <linux/bitfield.h>
+#include <linux/cpufeature.h>
 #include <linux/sizes.h>
 
 #include <drm/drm_managed.h>
@@ -644,18 +646,50 @@ static void emit_pte(struct xe_migrate *m,
 	}
 }
 
-#define EMIT_COPY_CCS_DW 5
+static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
+{
+	kernel_fpu_begin();
+
+#ifdef CONFIG_X86
+	if (size == SZ_128) {
+		asm("vmovdqu (%0), %%xmm0\n"
+		    "vmovups %%xmm0,   (%1)\n"
+		    :: "r" (src), "r" (dst) : "memory");
+	} else if (size == SZ_256) {
+		asm("vmovdqu (%0), %%ymm0\n"
+		    "vmovups %%ymm0,   (%1)\n"
+		    :: "r" (src), "r" (dst) : "memory");
+	}
+#endif
+	kernel_fpu_end();
+}
+
+static void emit_atomic(struct xe_gt *gt, void *dst, const void *src, u32 size)
+{
+	u32 instr_size = size * BITS_PER_BYTE;
+
+	xe_assert(gt_to_xe(gt), !(instr_size != SZ_128 && instr_size != SZ_256));
+
+	if (IS_SRIOV_VF(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX))
+		memcpy_vmovdqu(dst, src, instr_size);
+	else
+		memcpy(dst, src, size);
+}
+
+#define EMIT_COPY_CCS_DW 8
 static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
 			  u64 dst_ofs, bool dst_is_indirect,
 			  u64 src_ofs, bool src_is_indirect,
 			  u32 size)
 {
+	u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
 	struct xe_device *xe = gt_to_xe(gt);
 	u32 *cs = bb->cs + bb->len;
 	u32 num_ccs_blks;
 	u32 num_pages;
 	u32 ccs_copy_size;
 	u32 mocs;
+	u32 i = 0;
 
 	if (GRAPHICS_VERx100(xe) >= 2000) {
 		num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
@@ -673,15 +707,23 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
 		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
 	}
 
-	*cs++ = XY_CTRL_SURF_COPY_BLT |
-		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
-		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
-		ccs_copy_size;
-	*cs++ = lower_32_bits(src_ofs);
-	*cs++ = upper_32_bits(src_ofs) | mocs;
-	*cs++ = lower_32_bits(dst_ofs);
-	*cs++ = upper_32_bits(dst_ofs) | mocs;
+	dw[i++] = XY_CTRL_SURF_COPY_BLT |
+		  (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
+		  (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
+		  ccs_copy_size;
+	dw[i++] = lower_32_bits(src_ofs);
+	dw[i++] = upper_32_bits(src_ofs) | mocs;
+	dw[i++] = lower_32_bits(dst_ofs);
+	dw[i++] = upper_32_bits(dst_ofs) | mocs;
 
+	/*
+	 * The CCS copy command is a 5-dword sequence. If the vCPU halts during
+	 * save/restore while this sequence is being issued, partial writes may trigger
+	 * page faults when saving iGPU CCS metadata. Use the VMOVDQU instruction to
+	 * write the sequence atomically.
+	 */
+	emit_atomic(gt, cs, dw, sizeof(u32) * EMIT_COPY_CCS_DW);
+	cs += EMIT_COPY_CCS_DW;
 	bb->len = cs - bb->cs;
 }
 
@@ -993,18 +1035,26 @@ static u64 migrate_vm_ppgtt_addr_tlb_inval(void)
 	return (NUM_KERNEL_PDE - 2) * XE_PAGE_SIZE;
 }
 
-static int emit_flush_invalidate(u32 *dw, int i, u32 flags)
+/*
+ * The MI_FLUSH_DW command is a 4-dword sequence. If the vCPU halts during
+ * save/restore while this sequence is being issued, partial writes may
+ * trigger page faults when saving iGPU CCS metadata. Use
+ * emit_atomic() to write the sequence atomically.
+ */
+static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i, u32 flags)
 {
 	u64 addr = migrate_vm_ppgtt_addr_tlb_inval();
+	u32 tmp_dw[SZ_4] = {MI_NOOP}, j = 0;
+
+	tmp_dw[j++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
+		      MI_FLUSH_IMM_DW | flags;
+	tmp_dw[j++] = lower_32_bits(addr);
+	tmp_dw[j++] = upper_32_bits(addr);
+	tmp_dw[j++] = MI_NOOP;
 
-	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
-		  MI_FLUSH_IMM_DW | flags;
-	dw[i++] = lower_32_bits(addr);
-	dw[i++] = upper_32_bits(addr);
-	dw[i++] = MI_NOOP;
-	dw[i++] = MI_NOOP;
+	emit_atomic(q->gt, &dw[i], tmp_dw, sizeof(tmp_dw));
 
-	return i;
+	return i + j;
 }
 
 /**
@@ -1049,7 +1099,7 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
 	/* Calculate Batch buffer size */
 	batch_size = 0;
 	while (size) {
-		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
+		batch_size += 8; /* Flush + ggtt addr + 1 NOP */
 		u64 ccs_ofs, ccs_size;
 		u32 ccs_pt;
 
@@ -1090,7 +1140,7 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
 	 * sizes here again before copy command is emitted.
 	 */
 	while (size) {
-		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
+		batch_size += 8; /* Flush + ggtt addr + 1 NOP */
 		u32 flush_flags = 0;
 		u64 ccs_ofs, ccs_size;
 		u32 ccs_pt;
@@ -1113,11 +1163,11 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
 
 		emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
 
-		bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
+		bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);
 		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt,
 						  src_L0_ofs, dst_is_pltt,
 						  src_L0, ccs_ofs, true);
-		bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
+		bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);
 
 		size -= src_L0;
 	}
-- 
2.51.0



* [PATCH v4 2/3] drm/xe/migrate: Make emit_pte() header write atomic
  2025-10-06 15:24 [PATCH v4 0/3] drm/xe/migrate: Atomicize CCS copy command setup Satyanarayana K V P
  2025-10-06 15:24 ` [PATCH v4 1/3] " Satyanarayana K V P
@ 2025-10-06 15:24 ` Satyanarayana K V P
  2025-10-06 19:42   ` Matthew Brost
  2025-10-06 15:24 ` [PATCH v4 3/3] drm/xe/vf: Clear CCS read/write buffers in atomic way Satyanarayana K V P
  2 siblings, 1 reply; 9+ messages in thread
From: Satyanarayana K V P @ 2025-10-06 15:24 UTC (permalink / raw)
  To: intel-xe; +Cc: Satyanarayana K V P, Michal Wajdeczko, Matthew Brost,
	Matthew Auld

The MI_STORE_DATA_IMM instruction header is a quadword (two dwords) in
size. If the vCPU halts during save/restore while the header is being
programmed, partial writes may trigger page faults when saving iGPU CCS
metadata. Update the instruction header atomically.

Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>

---
V3 -> V4:
- New commit added.

V2 -> V3:
- None

V1 -> V2:
- None
---
 drivers/gpu/drm/xe/xe_migrate.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index b960fdcecd88..4c575be45a76 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -606,9 +606,14 @@ static void emit_pte(struct xe_migrate *m,
 
 	while (ptes) {
 		u32 chunk = min(MAX_PTE_PER_SDI, ptes);
+		u32 dw[SZ_2], i = 0;
+
+		dw[i++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
+		dw[i++] = ofs;
+
+		WRITE_ONCE(*(u64 *)&bb->cs[bb->len], READ_ONCE(*(u64 *)dw));
+		bb->len += i;
 
-		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
-		bb->cs[bb->len++] = ofs;
 		bb->cs[bb->len++] = 0;
 
 		cur_ofs = ofs;
-- 
2.51.0



* [PATCH v4 3/3] drm/xe/vf: Clear CCS read/write buffers in atomic way
  2025-10-06 15:24 [PATCH v4 0/3] drm/xe/migrate: Atomicize CCS copy command setup Satyanarayana K V P
  2025-10-06 15:24 ` [PATCH v4 1/3] " Satyanarayana K V P
  2025-10-06 15:24 ` [PATCH v4 2/3] drm/xe/migrate: Make emit_pte() header write atomic Satyanarayana K V P
@ 2025-10-06 15:24 ` Satyanarayana K V P
  2025-10-06 20:12   ` Matthew Brost
  2 siblings, 1 reply; 9+ messages in thread
From: Satyanarayana K V P @ 2025-10-06 15:24 UTC (permalink / raw)
  To: intel-xe; +Cc: Satyanarayana K V P, Michal Wajdeczko, Matthew Brost,
	Matthew Auld

Clear the contents of the CCS read/write batch buffer, ensuring no page
faults or GPU hangs occur if migration happens midway.

Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>

---
V3 -> V4:
- New commit added.

V2 -> V3:
- None

V1 -> V2:
- None
---
 drivers/gpu/drm/xe/xe_migrate.c      | 130 +++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_migrate.h      |   3 +
 drivers/gpu/drm/xe/xe_sriov_vf_ccs.c |   5 +-
 3 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index 4c575be45a76..71c446e74d84 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -651,6 +651,42 @@ static void emit_pte(struct xe_migrate *m,
 	}
 }
 
+static void emit_pte_clear(struct xe_gt *gt, struct xe_bb *bb, int start_offset,
+			   int end_offset)
+{
+	u32 dw_nop[SZ_4] = {MI_NOOP};
+	int i = start_offset;
+	int len = end_offset;
+	u32 *cs = bb->cs;
+
+	/* Reverses the operations performed by emit_pte() */
+	while (i < len) {
+		u32 dwords, qwords;
+
+		xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 23), cs[i]) == 0x20));
+
+		qwords = REG_FIELD_GET(MI_SDI_LEN_DW, cs[i]);
+		/*
+		 * If Store QW is enabled, then the value of the DW length
+		 * includes the header, address and multiple QW pairs of data
+		 * which means the values will be limited to odd values starting
+		 * at a value of 3 (3 representing the size of a 5 DW command
+		 * including header, 2 dw address and 2 dw data).
+		 */
+		dwords = qwords - 1;
+		/*
+		 * Do not clear header first. Clear PTEs first and then clear the
+		 * header to avoid page faults.
+		 */
+		memset(&cs[i + 3], MI_NOOP, (dwords) * sizeof(u32));
+
+		WRITE_ONCE(*(u64 *)&cs[i], READ_ONCE(*(u64 *)dw_nop));
+
+		cs[i + 2] = MI_NOOP;
+		i += (dwords + 3);
+	}
+}
+
 static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
 {
 	kernel_fpu_begin();
@@ -732,6 +768,17 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
 	bb->len = cs - bb->cs;
 }
 
+static u32 emit_copy_ccs_clear(struct xe_gt *gt, struct xe_bb *bb, u32 offset)
+{
+	u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
+	u32 *cs = bb->cs + offset - EMIT_COPY_CCS_DW;
+
+	xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 22), *cs) == 0x148));
+	emit_atomic(gt, cs, dw, sizeof(dw));
+
+	return offset - EMIT_COPY_CCS_DW;
+}
+
 #define EMIT_COPY_DW 10
 static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
 		      u64 src_ofs, u64 dst_ofs, unsigned int size,
@@ -1062,6 +1109,19 @@ static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i, u32 fl
 	return i + j;
 }
 
+static u32 emit_flush_invalidate_clear(struct xe_gt *gt, struct xe_bb *bb,
+				       u32 offset)
+{
+	u32 dw[SZ_4] = {MI_NOOP};
+	u32 *cs = bb->cs + offset - SZ_4;
+
+	xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 23), *cs) == 0x26));
+
+	emit_atomic(gt, cs, dw, sizeof(dw));
+
+	return offset - SZ_4;
+}
+
 /**
  * xe_migrate_ccs_rw_copy() - Copy content of TTM resources.
  * @tile: Tile whose migration context to be used.
@@ -1186,6 +1246,76 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
 	return err;
 }
 
+static u32 ccs_rw_pte_size(struct xe_gt *gt, struct xe_bb *bb, u32 offset)
+{
+	int len = bb->len;
+	u32 *cs = bb->cs;
+	u32 i = offset;
+
+	while (i < len) {
+		u32 dwords, qwords;
+
+		xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 23), cs[i]) == 0x20));
+
+		qwords = REG_FIELD_GET(MI_SDI_LEN_DW, cs[i]);
+		/*
+		 * If Store QW is enabled, then the value of the DW length
+		 * includes the header, address and multiple QW pairs of data
+		 * which means the values will be limited to odd values starting
+		 * at a value of 3 (3 representing the size of a 5 DW command
+		 * including header, 2 dw address and 2 dw data).
+		 */
+		dwords = qwords - 1;
+		i += dwords + 3;
+
+		/*
+		 * Break if the next dword is for emit_flush_invalidate_clear()
+		 * or emit_copy_ccs_clear()
+		 */
+		if ((REG_FIELD_GET(REG_GENMASK(31, 23), cs[i]) == 0x26) ||
+		    (REG_FIELD_GET(REG_GENMASK(31, 22), cs[i]) == 0x148))
+			break;
+	}
+	return i;
+}
+
+/**
+ * xe_migrate_ccs_rw_copy_clear() - Clear the CCS read/write batch buffer
+ * content.
+ * @tile: Tile whose migration context to be used.
+ * @src_bo: The buffer object @src is currently bound to.
+ * @read_write: CCS read/write context whose BB commands are cleared.
+ *
+ * The CCS copy command has three stages: PTE setup, TLB invalidation, and CCS
+ * copy. Each stage includes a header followed by instructions. When clearing,
+ * remove the instructions first, then the header. For the TLB invalidation and
+ * CCS copy stages, ensure the writes are atomic.
+ *
+ * This reverses the operations performed by xe_migrate_ccs_rw_copy().
+ *
+ * Returns: None.
+ */
+void xe_migrate_ccs_rw_copy_clear(struct xe_tile *tile, struct xe_bo *src_bo,
+				  enum xe_sriov_vf_ccs_rw_ctxs read_write)
+{
+	struct xe_bb *bb = src_bo->bb_ccs[read_write];
+	u32 bb_offset = 0, bb_offset_chunk = 0;
+	struct xe_gt *gt = tile->primary_gt;
+
+	while (bb_offset_chunk >= 0 && bb_offset_chunk < bb->len) {
+		bb_offset = ccs_rw_pte_size(gt, bb, bb_offset_chunk);
+		/*
+		 * After PTE entries, we have one TLB invalidation, CCS copy
+		 * command and another TLB invalidation command.
+		 */
+		bb_offset_chunk = bb_offset + SZ_4 + EMIT_COPY_CCS_DW + SZ_4;
+		bb_offset = emit_flush_invalidate_clear(gt, bb, bb_offset_chunk);
+		bb_offset = emit_copy_ccs_clear(gt, bb, bb_offset);
+		bb_offset = emit_flush_invalidate_clear(gt, bb, bb_offset);
+		emit_pte_clear(gt, bb, bb_offset_chunk, bb_offset);
+	}
+}
+
 /**
  * xe_get_migrate_exec_queue() - Get the execution queue from migrate context.
  * @migrate: Migrate context.
diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
index 0d8944b1cee6..bd2c0eb3ad94 100644
--- a/drivers/gpu/drm/xe/xe_migrate.h
+++ b/drivers/gpu/drm/xe/xe_migrate.h
@@ -129,6 +129,9 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
 			   struct xe_bo *src_bo,
 			   enum xe_sriov_vf_ccs_rw_ctxs read_write);
 
+void xe_migrate_ccs_rw_copy_clear(struct xe_tile *tile, struct xe_bo *src_bo,
+				  enum xe_sriov_vf_ccs_rw_ctxs read_write);
+
 struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate);
 struct xe_exec_queue *xe_migrate_exec_queue(struct xe_migrate *migrate);
 struct dma_fence *xe_migrate_raw_vram_copy(struct xe_bo *vram_bo, u64 vram_offset,
diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
index 790249801364..2d3728cb24ca 100644
--- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
+++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
@@ -387,6 +387,7 @@ int xe_sriov_vf_ccs_detach_bo(struct xe_bo *bo)
 {
 	struct xe_device *xe = xe_bo_device(bo);
 	enum xe_sriov_vf_ccs_rw_ctxs ctx_id;
+	struct xe_tile *tile;
 	struct xe_bb *bb;
 
 	xe_assert(xe, IS_VF_CCS_READY(xe));
@@ -394,12 +395,14 @@ int xe_sriov_vf_ccs_detach_bo(struct xe_bo *bo)
 	if (!xe_bo_has_valid_ccs_bb(bo))
 		return 0;
 
+	tile = xe_device_get_root_tile(xe);
+
 	for_each_ccs_rw_ctx(ctx_id) {
 		bb = bo->bb_ccs[ctx_id];
 		if (!bb)
 			continue;
 
-		memset(bb->cs, MI_NOOP, bb->len * sizeof(u32));
+		xe_migrate_ccs_rw_copy_clear(tile, bo, ctx_id);
 		xe_bb_free(bb, NULL);
 		bo->bb_ccs[ctx_id] = NULL;
 	}
-- 
2.51.0



* Re: [PATCH v4 2/3] drm/xe/migrate: Make emit_pte() header write atomic
  2025-10-06 15:24 ` [PATCH v4 2/3] drm/xe/migrate: Make emit_pte() header write atomic Satyanarayana K V P
@ 2025-10-06 19:42   ` Matthew Brost
  0 siblings, 0 replies; 9+ messages in thread
From: Matthew Brost @ 2025-10-06 19:42 UTC (permalink / raw)
  To: Satyanarayana K V P; +Cc: intel-xe, Michal Wajdeczko, Matthew Auld

On Mon, Oct 06, 2025 at 08:54:46PM +0530, Satyanarayana K V P wrote:
> The MI_STORE_DATA_IMM instruction header is a quadword (two dwords) in
> size. If the vCPU halts during save/restore while the header is being
> programmed, partial writes may trigger page faults when saving iGPU CCS
> metadata. Update the instruction header atomically.
> 
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Matthew Auld <matthew.auld@intel.com>
> 
> ---
> V3 -> V4:
> - New commit added.
> 
> V2 -> V3:
> - None
> 
> V1 -> V2:
> - None
> ---
>  drivers/gpu/drm/xe/xe_migrate.c | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index b960fdcecd88..4c575be45a76 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -606,9 +606,14 @@ static void emit_pte(struct xe_migrate *m,
>  
>  	while (ptes) {
>  		u32 chunk = min(MAX_PTE_PER_SDI, ptes);
> +		u32 dw[SZ_2], i = 0;
> +
> +		dw[i++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
> +		dw[i++] = ofs;
> +
> +		WRITE_ONCE(*(u64 *)&bb->cs[bb->len], READ_ONCE(*(u64 *)dw));

You don't need the READ_ONCE.
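
i.e. this should be enough:

	WRITE_ONCE(*(u64 *)&bb->cs[bb->len], *(u64 *)dw);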

This patch should safely cover the vCPU halting case.

However, I think we still need to work out how to handle the case where
vCPUs are unhalted and modifying the buffer faster than the GPU can read
it. For example, the instruction header might be read as a NOP by the
GPU, but the CPU then programs the subsequent store values, which the
GPU interprets as invalid instructions. We can figure out how to fix
this part in a follow up as it will depend on my VF migration series
landing first.

Matt

> +		bb->len += i;
>  
> -		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
> -		bb->cs[bb->len++] = ofs;
>  		bb->cs[bb->len++] = 0;
>  
>  		cur_ofs = ofs;
> -- 
> 2.51.0
> 


* Re: [PATCH v4 1/3] drm/xe/migrate: Atomicize CCS copy command setup
  2025-10-06 15:24 ` [PATCH v4 1/3] " Satyanarayana K V P
@ 2025-10-06 19:58   ` Matthew Brost
  2025-10-08  9:50     ` K V P, Satyanarayana
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Brost @ 2025-10-06 19:58 UTC (permalink / raw)
  To: Satyanarayana K V P; +Cc: intel-xe, Michal Wajdeczko, Matthew Auld

On Mon, Oct 06, 2025 at 08:54:45PM +0530, Satyanarayana K V P wrote:
> The CCS copy command is a 5-dword sequence. If the vCPU halts during
> save/restore while this sequence is being programmed, partial writes may
> trigger page faults when saving iGPU CCS metadata. Use the VMOVDQU
> instruction to write the sequence atomically.
> 
> Since the 256-bit form of VMOVDQU is used here, update EMIT_COPY_CCS_DW
> to emit 8 dwords instead of 5.
> 
> Update emit_flush_invalidate() to use VMOVDQU operating with 128-bit
> chunks.
> 
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Matthew Auld <matthew.auld@intel.com>
> 
> ---
> V3 -> V4:
> - Fixed review comments. (Wajdeczko)
> - Fix issues reported by patchworks.
> 
> V2 -> V3:
> - Added support for 128 bit and 256 bit instructions with memcpy_vmovdqu
> - Updated emit_flush_invalidate() to use vmovdqu instruction.
> 
> V1 -> V2:
> - Use memcpy_vmovdqu only for x86 arch and for VF. Else use memcpy
>   (Auld, Matthew)
> - Fix issues reported by patchworks.
> ---
>  drivers/gpu/drm/xe/xe_migrate.c | 92 +++++++++++++++++++++++++--------
>  1 file changed, 71 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index c39c3b423d05..b960fdcecd88 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -5,7 +5,9 @@
>  
>  #include "xe_migrate.h"
>  
> +#include <asm/fpu/api.h>
>  #include <linux/bitfield.h>
> +#include <linux/cpufeature.h>
>  #include <linux/sizes.h>
>  
>  #include <drm/drm_managed.h>
> @@ -644,18 +646,50 @@ static void emit_pte(struct xe_migrate *m,
>  	}
>  }
>  
> -#define EMIT_COPY_CCS_DW 5
> +static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
> +{
> +	kernel_fpu_begin();
> +
> +#ifdef CONFIG_X86
> +	if (size == SZ_128) {
> +		asm("vmovdqu (%0), %%xmm0\n"
> +		    "vmovups %%xmm0,   (%1)\n"
> +		    :: "r" (src), "r" (dst) : "memory");
> +	} else if (size == SZ_256) {
> +		asm("vmovdqu (%0), %%ymm0\n"
> +		    "vmovups %%ymm0,   (%1)\n"
> +		    :: "r" (src), "r" (dst) : "memory");
> +	}
> +#endif
> +	kernel_fpu_end();

I think you can hide this entire function behind #ifdef CONFIG_X86.
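
Something like this sketch, with a plain memcpy fallback so the
!CONFIG_X86 build still compiles (size is in bits here, as in the
patch):

#ifdef CONFIG_X86
static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
{
	kernel_fpu_begin();
	/* the vmovdqu/vmovups asm from above */
	kernel_fpu_end();
}
#else
static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
{
	memcpy(dst, src, size / BITS_PER_BYTE);
}
#endif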

> +}
> +
> +static void emit_atomic(struct xe_gt *gt, void *dst, const void *src, u32 size)
> +{
> +	u32 instr_size = size * BITS_PER_BYTE;
> +
> +	xe_assert(gt_to_xe(gt), !(instr_size != SZ_128 && instr_size != SZ_256));

I think it is slightly more clear to write it like this:

xe_assert(gt_to_xe(gt), instr_size == SZ_128 || instr_size == SZ_256);

I suspect Michal would insist on xe_gt_assert here too.

> +
> +	if (IS_SRIOV_VF(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX))

Should this be VF CCS initialized check rather than generic VF check?
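
e.g. something like this, assuming IS_VF_CCS_READY() is usable from
here:

	if (IS_VF_CCS_READY(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX))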

> +		memcpy_vmovdqu(dst, src, instr_size);
> +	else
> +		memcpy(dst, src, size);
> +}
> +
> +#define EMIT_COPY_CCS_DW 8
>  static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
>  			  u64 dst_ofs, bool dst_is_indirect,
>  			  u64 src_ofs, bool src_is_indirect,
>  			  u32 size)
>  {
> +	u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
>  	struct xe_device *xe = gt_to_xe(gt);
>  	u32 *cs = bb->cs + bb->len;
>  	u32 num_ccs_blks;
>  	u32 num_pages;
>  	u32 ccs_copy_size;
>  	u32 mocs;
> +	u32 i = 0;
>  
>  	if (GRAPHICS_VERx100(xe) >= 2000) {
>  		num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
> @@ -673,15 +707,23 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
>  		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
>  	}
>  
> -	*cs++ = XY_CTRL_SURF_COPY_BLT |
> -		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
> -		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
> -		ccs_copy_size;
> -	*cs++ = lower_32_bits(src_ofs);
> -	*cs++ = upper_32_bits(src_ofs) | mocs;
> -	*cs++ = lower_32_bits(dst_ofs);
> -	*cs++ = upper_32_bits(dst_ofs) | mocs;
> +	dw[i++] = XY_CTRL_SURF_COPY_BLT |
> +		  (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
> +		  (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
> +		  ccs_copy_size;
> +	dw[i++] = lower_32_bits(src_ofs);
> +	dw[i++] = upper_32_bits(src_ofs) | mocs;
> +	dw[i++] = lower_32_bits(dst_ofs);
> +	dw[i++] = upper_32_bits(dst_ofs) | mocs;
>  
> +	/*
> +	 * The CCS copy command is a 5-dword sequence. If the vCPU halts during
> +	 * save/restore while this sequence is being issued, partial writes may trigger
> +	 * page faults when saving iGPU CCS metadata. Use the VMOVDQU instruction to
> +	 * write the sequence atomically.
> +	 */
> +	emit_atomic(gt, cs, dw, sizeof(u32) * EMIT_COPY_CCS_DW);

Use sizeof(dw) to keep this consistent with below, or change below to
match the logic here.

> +	cs += EMIT_COPY_CCS_DW;
>  	bb->len = cs - bb->cs;
>  }
>  
> @@ -993,18 +1035,26 @@ static u64 migrate_vm_ppgtt_addr_tlb_inval(void)
>  	return (NUM_KERNEL_PDE - 2) * XE_PAGE_SIZE;
>  }
>  
> -static int emit_flush_invalidate(u32 *dw, int i, u32 flags)
> +/*
> + * The MI_FLUSH_DW command is a 4-dword sequence. If the vCPU halts during
> + * save/restore while this sequence is being issued, partial writes may
> + * trigger page faults when saving iGPU CCS metadata. Use
> + * emit_atomic() to write the sequence atomically.
> + */
> +static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i, u32 flags)

s/dw/cs ?

>  {
>  	u64 addr = migrate_vm_ppgtt_addr_tlb_inval();
> +	u32 tmp_dw[SZ_4] = {MI_NOOP}, j = 0;

#define EMIT_FLUSH_INVALIDATE_DW 4 ?

s/tmp_dw/cs ?

> +
> +	tmp_dw[j++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> +		      MI_FLUSH_IMM_DW | flags;
> +	tmp_dw[j++] = lower_32_bits(addr);
> +	tmp_dw[j++] = upper_32_bits(addr);
> +	tmp_dw[j++] = MI_NOOP;
>  
> -	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> -		  MI_FLUSH_IMM_DW | flags;
> -	dw[i++] = lower_32_bits(addr);
> -	dw[i++] = upper_32_bits(addr);
> -	dw[i++] = MI_NOOP;
> -	dw[i++] = MI_NOOP;
> +	emit_atomic(q->gt, &dw[i], tmp_dw, sizeof(tmp_dw));
>  
> -	return i;
> +	return i + j;
>  }
>  
>  /**
> @@ -1049,7 +1099,7 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
>  	/* Calculate Batch buffer size */
>  	batch_size = 0;
>  	while (size) {
> -		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
> +		batch_size += 8; /* Flush + ggtt addr + 1 NOP */
>  		u64 ccs_ofs, ccs_size;
>  		u32 ccs_pt;
>  
> @@ -1090,7 +1140,7 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
>  	 * sizes here again before copy command is emitted.
>  	 */
>  	while (size) {
> -		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
> +		batch_size += 8; /* Flush + ggtt addr + 1 NOP */

EMIT_FLUSH_INVALIDATE_DW * 2 ?
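
i.e.:

	batch_size += EMIT_FLUSH_INVALIDATE_DW * 2; /* two flushes per chunk */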

>  		u32 flush_flags = 0;
>  		u64 ccs_ofs, ccs_size;
>  		u32 ccs_pt;
> @@ -1113,11 +1163,11 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
>  
>  		emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
>  
> -		bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
> +		bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);
>  		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt,
>  						  src_L0_ofs, dst_is_pltt,
>  						  src_L0, ccs_ofs, true);
> -		bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
> +		bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);

Side note: I don't think the second emit_flush_invalidate is actually
necessary here. Removing it is probably out of scope for this series,
but once this is merged and testing is stable, we can try removing it in
a follow-up and see what happens.

Matt

>  
>  		size -= src_L0;
>  	}
> -- 
> 2.51.0
> 


* Re: [PATCH v4 3/3] drm/xe/vf: Clear CCS read/write buffers in atomic way
  2025-10-06 15:24 ` [PATCH v4 3/3] drm/xe/vf: Clear CCS read/write buffers in atomic way Satyanarayana K V P
@ 2025-10-06 20:12   ` Matthew Brost
  2025-10-08  1:40     ` Matthew Brost
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Brost @ 2025-10-06 20:12 UTC (permalink / raw)
  To: Satyanarayana K V P; +Cc: intel-xe, Michal Wajdeczko, Matthew Auld

On Mon, Oct 06, 2025 at 08:54:47PM +0530, Satyanarayana K V P wrote:
> Clear the contents of the CCS read/write batch buffer, ensuring no page
> faults or GPU hangs occur if migration happens midway.
> 

It is going to take me a minute to fully validate the algorithm given the
complexity, but some quick comments.

> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Matthew Auld <matthew.auld@intel.com>
> 
> ---
> V3 -> V4:
> - New commit added.
> 
> V2 -> V3:
> - None
> 
> V1 -> V2:
> - None
> ---
>  drivers/gpu/drm/xe/xe_migrate.c      | 130 +++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_migrate.h      |   3 +
>  drivers/gpu/drm/xe/xe_sriov_vf_ccs.c |   5 +-
>  3 files changed, 137 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 4c575be45a76..71c446e74d84 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -651,6 +651,42 @@ static void emit_pte(struct xe_migrate *m,
>  	}
>  }
>  
> +static void emit_pte_clear(struct xe_gt *gt, struct xe_bb *bb, int start_offset,
> +			   int end_offset)
> +{
> +	u32 dw_nop[SZ_4] = {MI_NOOP};

SZ_2 or just 2.
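
i.e.:

	u32 dw_nop[2] = {MI_NOOP};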

> +	int i = start_offset;
> +	int len = end_offset;
> +	u32 *cs = bb->cs;
> +
> +	/* Reverses the operations performed by emit_pte() */
> +	while (i < len) {
> +		u32 dwords, qwords;
> +
> +		xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 23), cs[i]) == 0x20));
> +
> +		qwords = REG_FIELD_GET(MI_SDI_LEN_DW, cs[i]);
> +		/*
> +		 * If Store QW is enabled, then the value of the DW length
> +		 * includes the header, address and multiple QW pairs of data
> +		 * which means the values will be limited to odd values starting
> +		 * at a value of 3 (3 representing the size of a 5 DW command
> +		 * including header, 2 dw address and 2 dw data).
> +		 */
> +		dwords = qwords - 1;
> +		/*
> +		 * Do not clear header first. Clear PTEs first and then clear the
> +		 * header to avoid page faults.
> +		 */
> +		memset(&cs[i + 3], MI_NOOP, (dwords) * sizeof(u32));
> +

I think you need a wmb() here to ensure the above is GPU visible before
clearing the header instruction.
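
i.e. a sketch:

	memset(&cs[i + 3], MI_NOOP, dwords * sizeof(u32));

	/* make the cleared PTEs visible before the header is NOOPed */
	wmb();

	WRITE_ONCE(*(u64 *)&cs[i], *(u64 *)dw_nop);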

> +		WRITE_ONCE(*(u64 *)&cs[i], READ_ONCE(*(u64 *)dw_nop));
> +
> +		cs[i + 2] = MI_NOOP;
> +		i += (dwords + 3);
> +	}
> +}
> +
>  static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
>  {
>  	kernel_fpu_begin();
> @@ -732,6 +768,17 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
>  	bb->len = cs - bb->cs;
>  }
>  
> +static u32 emit_copy_ccs_clear(struct xe_gt *gt, struct xe_bb *bb, u32 offset)
> +{
> +	u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
> +	u32 *cs = bb->cs + offset - EMIT_COPY_CCS_DW;
> +
> +	xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 22), *cs) == 0x148));
> +	emit_atomic(gt, cs, dw, sizeof(dw));

I think you need a wmb() here so the clearing of the copy is GPU visible
before you start clearing out the PTEs.
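
i.e. a sketch:

	emit_atomic(gt, cs, dw, sizeof(dw));

	/* make the NOOPed copy command visible before the PTEs are cleared */
	wmb();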

> +
> +	return offset - EMIT_COPY_CCS_DW;
> +}
> +
>  #define EMIT_COPY_DW 10
>  static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
>  		      u64 src_ofs, u64 dst_ofs, unsigned int size,
> @@ -1062,6 +1109,19 @@ static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i, u32 fl
>  	return i + j;
>  }
>  
> +static u32 emit_flush_invalidate_clear(struct xe_gt *gt, struct xe_bb *bb,
> +				       u32 offset)
> +{
> +	u32 dw[SZ_4] = {MI_NOOP};

As discussed in patch 1 use EMIT_FLUSH_INVALIDATE_DW.
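
i.e.:

	u32 dw[EMIT_FLUSH_INVALIDATE_DW] = {MI_NOOP};
	u32 *cs = bb->cs + offset - EMIT_FLUSH_INVALIDATE_DW;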

Matt

> +	u32 *cs = bb->cs + offset - SZ_4;
> +
> +	xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 23), *cs) == 0x26));
> +
> +	emit_atomic(gt, cs, dw, sizeof(dw));
> +
> +	return offset - SZ_4;
> +}
> +
>  /**
>   * xe_migrate_ccs_rw_copy() - Copy content of TTM resources.
>   * @tile: Tile whose migration context to be used.
> @@ -1186,6 +1246,76 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
>  	return err;
>  }
>  
> +static u32 ccs_rw_pte_size(struct xe_gt *gt, struct xe_bb *bb, u32 offset)
> +{
> +	int len = bb->len;
> +	u32 *cs = bb->cs;
> +	u32 i = offset;
> +
> +	while (i < len) {
> +		u32 dwords, qwords;
> +
> +		xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 23), cs[i]) == 0x20));
> +
> +		qwords = REG_FIELD_GET(MI_SDI_LEN_DW, cs[i]);
> +		/*
> +		 * If Store QW is enabled, then the value of the dwlengh
> +		 * includes the header, address and multiple QW pairs of data
> +		 * which means the values will be limited to odd values starting
> +		 * at a value of 3(3 representing the size of a 5 DW command
> +		 * including header, 2 dw address and 2 dw data).
> +		 */
> +		dwords = qwords - 1;
> +		i += dwords + 3;
> +
> +		/*
> +		 * Break if the next dword is for emit_flush_invalidate_clear()
> +		 * or emit_copy_ccs_clear()
> +		 */
> +		if ((REG_FIELD_GET(REG_GENMASK(31, 23), cs[i]) == 0x26) ||
> +		    (REG_FIELD_GET(REG_GENMASK(31, 22), cs[i]) == 0x148))
> +			break;
> +	}
> +	return i;
> +}
> +
> +/**
> + * xe_migrate_ccs_rw_copy_clear() - Clear the CCS read/write batch buffer
> + * content.
> + * @tile: Tile whose migration context to be used.
> + * @src_bo: The buffer object @src is currently bound to.
> + * @read_write : Creates BB commands for CCS read/write.
> + *
> + * The CCS copy command has three stages: PTE setup, TLB invalidation, and CCS
> + * copy. Each stage includes a header followed by instructions. When clearing,
> + * remove the instructions first, then the header. For the TLB invalidation and
> + * CCS copy stages, ensure the writes are atomic.
> + *
> + * This reverses the operations performed by xe_migrate_ccs_rw_copy().
> + *
> + * Returns: None.
> + */
> +void xe_migrate_ccs_rw_copy_clear(struct xe_tile *tile, struct xe_bo *src_bo,
> +				  enum xe_sriov_vf_ccs_rw_ctxs read_write)
> +{
> +	struct xe_bb *bb = src_bo->bb_ccs[read_write];
> +	u32 bb_offset = 0, bb_offset_chunk = 0;
> +	struct xe_gt *gt = tile->primary_gt;
> +
> +	while (bb_offset_chunk >= 0 && bb_offset_chunk < bb->len) {
> +		bb_offset = ccs_rw_pte_size(gt, bb, bb_offset_chunk);
> +		/*
> +		 * After PTE entries, we have one TLB invalidation, CCS copy
> +		 * command and another TLB invalidation command.
> +		 */
> +		bb_offset_chunk = bb_offset + SZ_4 + EMIT_COPY_CCS_DW + SZ_4;
> +		bb_offset = emit_flush_invalidate_clear(gt, bb, bb_offset_chunk);
> +		bb_offset = emit_copy_ccs_clear(gt, bb, bb_offset);
> +		bb_offset = emit_flush_invalidate_clear(gt, bb, bb_offset);
> +		emit_pte_clear(gt, bb, bb_offset_chunk, bb_offset);
> +	}
> +}
> +
>  /**
>   * xe_get_migrate_exec_queue() - Get the execution queue from migrate context.
>   * @migrate: Migrate context.
> diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> index 0d8944b1cee6..bd2c0eb3ad94 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.h
> +++ b/drivers/gpu/drm/xe/xe_migrate.h
> @@ -129,6 +129,9 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
>  			   struct xe_bo *src_bo,
>  			   enum xe_sriov_vf_ccs_rw_ctxs read_write);
>  
> +void xe_migrate_ccs_rw_copy_clear(struct xe_tile *tile, struct xe_bo *src_bo,
> +				  enum xe_sriov_vf_ccs_rw_ctxs read_write);
> +
>  struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate);
>  struct xe_exec_queue *xe_migrate_exec_queue(struct xe_migrate *migrate);
>  struct dma_fence *xe_migrate_raw_vram_copy(struct xe_bo *vram_bo, u64 vram_offset,
> diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> index 790249801364..2d3728cb24ca 100644
> --- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> +++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> @@ -387,6 +387,7 @@ int xe_sriov_vf_ccs_detach_bo(struct xe_bo *bo)
>  {
>  	struct xe_device *xe = xe_bo_device(bo);
>  	enum xe_sriov_vf_ccs_rw_ctxs ctx_id;
> +	struct xe_tile *tile;
>  	struct xe_bb *bb;
>  
>  	xe_assert(xe, IS_VF_CCS_READY(xe));
> @@ -394,12 +395,14 @@ int xe_sriov_vf_ccs_detach_bo(struct xe_bo *bo)
>  	if (!xe_bo_has_valid_ccs_bb(bo))
>  		return 0;
>  
> +	tile = xe_device_get_root_tile(xe);
> +
>  	for_each_ccs_rw_ctx(ctx_id) {
>  		bb = bo->bb_ccs[ctx_id];
>  		if (!bb)
>  			continue;
>  
> -		memset(bb->cs, MI_NOOP, bb->len * sizeof(u32));
> +		xe_migrate_ccs_rw_copy_clear(tile, bo, ctx_id);
>  		xe_bb_free(bb, NULL);
>  		bo->bb_ccs[ctx_id] = NULL;
>  	}
> -- 
> 2.51.0
> 


* Re: [PATCH v4 3/3] drm/xe/vf: Clear CCS read/write buffers in atomic way
  2025-10-06 20:12   ` Matthew Brost
@ 2025-10-08  1:40     ` Matthew Brost
  0 siblings, 0 replies; 9+ messages in thread
From: Matthew Brost @ 2025-10-08  1:40 UTC (permalink / raw)
  To: Satyanarayana K V P; +Cc: intel-xe, Michal Wajdeczko, Matthew Auld

On Mon, Oct 06, 2025 at 01:12:43PM -0700, Matthew Brost wrote:
> On Mon, Oct 06, 2025 at 08:54:47PM +0530, Satyanarayana K V P wrote:
> > Clear the contents of the CCS read/write batch buffer, ensuring no page
> > faults / GPU hang occur if migration happens midway.
> > 
> 
> It is going to take me a minute to fully validate the algorithm given the
> complexity, but some quick comments.
> 
> > Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
> > Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> > Cc: Matthew Brost <matthew.brost@intel.com>
> > Cc: Matthew Auld <matthew.auld@intel.com>
> > 
> > ---
> > V3 -> V4:
> > - New commit added.
> > 
> > V2 -> V3:
> > - None
> > 
> > V1 -> V2:
> > - None
> > ---
> >  drivers/gpu/drm/xe/xe_migrate.c      | 130 +++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/xe_migrate.h      |   3 +
> >  drivers/gpu/drm/xe/xe_sriov_vf_ccs.c |   5 +-
> >  3 files changed, 137 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> > index 4c575be45a76..71c446e74d84 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.c
> > +++ b/drivers/gpu/drm/xe/xe_migrate.c
> > @@ -651,6 +651,42 @@ static void emit_pte(struct xe_migrate *m,
> >  	}
> >  }
> >  
> > +static void emit_pte_clear(struct xe_gt *gt, struct xe_bb *bb, int start_offset,
> > +			   int end_offset)
> > +{
> > +	u32 dw_nop[SZ_4] = {MI_NOOP};
> 
> SZ_2 or just 2.
> 
> > +	int i = start_offset;
> > +	int len = end_offset;
> > +	u32 *cs = bb->cs;
> > +
> > +	/* Reverses the operations performed by emit_pte() */
> > +	while (i < len) {
> > +		u32 dwords, qwords;
> > +
> > +		xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 23), cs[i]) == 0x20));

Ah, I see you have asserts everywhere, which is self-validating. Good
idea. If these asserts are not popping then I don't really have any more
comments. Looks good.

Matt

> > +
> > +		qwords = REG_FIELD_GET(MI_SDI_LEN_DW, cs[i]);
> > +		/*
> > +		 * If Store QW is enabled, then the value of the DW length
> > +		 * includes the header, address and multiple QW pairs of data
> > +		 * which means the values will be limited to odd values starting
> > +		 * at a value of 3 (3 representing the size of a 5 DW command
> > +		 * including header, 2 dw address and 2 dw data).
> > +		 */
> > +		dwords = qwords - 1;
> > +		/*
> > +		 * Do not clear header first. Clear PTEs first and then clear the
> > +		 * header to avoid page faults.
> > +		 */
> > +		memset(&cs[i + 3], MI_NOOP, (dwords) * sizeof(u32));
> > +
> 
> I think you need a wmb() here to ensure the above is GPU visible before
> clearing the header instruction.
> 
> > +		WRITE_ONCE(*(u64 *)&cs[i], READ_ONCE(*(u64 *)dw_nop));
> > +
> > +		cs[i + 2] = MI_NOOP;
> > +		i += (dwords + 3);
> > +	}
> > +}
> > +
> >  static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
> >  {
> >  	kernel_fpu_begin();
> > @@ -732,6 +768,17 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
> >  	bb->len = cs - bb->cs;
> >  }
> >  
> > +static u32 emit_copy_ccs_clear(struct xe_gt *gt, struct xe_bb *bb, u32 offset)
> > +{
> > +	u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
> > +	u32 *cs = bb->cs + offset - EMIT_COPY_CCS_DW;
> > +
> > +	xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 22), *cs) == 0x148));
> > +	emit_atomic(gt, cs, dw, sizeof(dw));
> 
> I think you need a wmb() here so the clearing of the copy is GPU visible
> before you start clearing out the PTEs.
> 
> > +
> > +	return offset - EMIT_COPY_CCS_DW;
> > +}
> > +
> >  #define EMIT_COPY_DW 10
> >  static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
> >  		      u64 src_ofs, u64 dst_ofs, unsigned int size,
> > @@ -1062,6 +1109,19 @@ static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i, u32 fl
> >  	return i + j;
> >  }
> >  
> > +static u32 emit_flush_invalidate_clear(struct xe_gt *gt, struct xe_bb *bb,
> > +				       u32 offset)
> > +{
> > +	u32 dw[SZ_4] = {MI_NOOP};
> 
> As discussed in patch 1 use EMIT_FLUSH_INVALIDATE_DW.
> 
> Matt
> 
> > +	u32 *cs = bb->cs + offset - SZ_4;
> > +
> > +	xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 23), *cs) == 0x26));
> > +
> > +	emit_atomic(gt, cs, dw, sizeof(dw));
> > +
> > +	return offset - SZ_4;
> > +}
> > +
> >  /**
> >   * xe_migrate_ccs_rw_copy() - Copy content of TTM resources.
> >   * @tile: Tile whose migration context to be used.
> > @@ -1186,6 +1246,76 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
> >  	return err;
> >  }
> >  
> > +static u32 ccs_rw_pte_size(struct xe_gt *gt, struct xe_bb *bb, u32 offset)
> > +{
> > +	int len = bb->len;
> > +	u32 *cs = bb->cs;
> > +	u32 i = offset;
> > +
> > +	while (i < len) {
> > +		u32 dwords, qwords;
> > +
> > +		xe_assert(gt_to_xe(gt), (REG_FIELD_GET(REG_GENMASK(31, 23), cs[i]) == 0x20));
> > +
> > +		qwords = REG_FIELD_GET(MI_SDI_LEN_DW, cs[i]);
> > +		/*
> > +		 * If Store QW is enabled, then the value of the DW length
> > +		 * includes the header, address and multiple QW pairs of data
> > +		 * which means the values will be limited to odd values starting
> > +		 * at a value of 3 (3 representing the size of a 5 DW command
> > +		 * including header, 2 dw address and 2 dw data).
> > +		 */
> > +		dwords = qwords - 1;
> > +		i += dwords + 3;
> > +
> > +		/*
> > +		 * Break if the next dword is for emit_flush_invalidate_clear()
> > +		 * or emit_copy_ccs_clear()
> > +		 */
> > +		if ((REG_FIELD_GET(REG_GENMASK(31, 23), cs[i]) == 0x26) ||
> > +		    (REG_FIELD_GET(REG_GENMASK(31, 22), cs[i]) == 0x148))
> > +			break;
> > +	}
> > +	return i;
> > +}
> > +
> > +/**
> > + * xe_migrate_ccs_rw_copy_clear() - Clear the CCS read/write batch buffer
> > + * content.
> > + * @tile: Tile whose migration context to be used.
> > + * @src_bo: The buffer object @src is currently bound to.
> > + * @read_write: CCS read/write context whose BB commands are cleared.
> > + *
> > + * The CCS copy command has three stages: PTE setup, TLB invalidation, and CCS
> > + * copy. Each stage includes a header followed by instructions. When clearing,
> > + * remove the instructions first, then the header. For the TLB invalidation and
> > + * CCS copy stages, ensure the writes are atomic.
> > + *
> > + * This reverses the operations performed by xe_migrate_ccs_rw_copy().
> > + *
> > + * Returns: None.
> > + */
> > +void xe_migrate_ccs_rw_copy_clear(struct xe_tile *tile, struct xe_bo *src_bo,
> > +				  enum xe_sriov_vf_ccs_rw_ctxs read_write)
> > +{
> > +	struct xe_bb *bb = src_bo->bb_ccs[read_write];
> > +	u32 bb_offset = 0, bb_offset_chunk = 0;
> > +	struct xe_gt *gt = tile->primary_gt;
> > +
> > +	while (bb_offset_chunk >= 0 && bb_offset_chunk < bb->len) {
> > +		bb_offset = ccs_rw_pte_size(gt, bb, bb_offset_chunk);
> > +		/*
> > +		 * After PTE entries, we have one TLB invalidation, CCS copy
> > +		 * command and another TLB invalidation command.
> > +		 */
> > +		bb_offset_chunk = bb_offset + SZ_4 + EMIT_COPY_CCS_DW + SZ_4;
> > +		bb_offset = emit_flush_invalidate_clear(gt, bb, bb_offset_chunk);
> > +		bb_offset = emit_copy_ccs_clear(gt, bb, bb_offset);
> > +		bb_offset = emit_flush_invalidate_clear(gt, bb, bb_offset);
> > +		emit_pte_clear(gt, bb, bb_offset_chunk, bb_offset);
> > +	}
> > +}
> > +
> >  /**
> >   * xe_get_migrate_exec_queue() - Get the execution queue from migrate context.
> >   * @migrate: Migrate context.
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> > index 0d8944b1cee6..bd2c0eb3ad94 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.h
> > +++ b/drivers/gpu/drm/xe/xe_migrate.h
> > @@ -129,6 +129,9 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
> >  			   struct xe_bo *src_bo,
> >  			   enum xe_sriov_vf_ccs_rw_ctxs read_write);
> >  
> > +void xe_migrate_ccs_rw_copy_clear(struct xe_tile *tile, struct xe_bo *src_bo,
> > +				  enum xe_sriov_vf_ccs_rw_ctxs read_write);
> > +
> >  struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate);
> >  struct xe_exec_queue *xe_migrate_exec_queue(struct xe_migrate *migrate);
> >  struct dma_fence *xe_migrate_raw_vram_copy(struct xe_bo *vram_bo, u64 vram_offset,
> > diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> > index 790249801364..2d3728cb24ca 100644
> > --- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> > +++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> > @@ -387,6 +387,7 @@ int xe_sriov_vf_ccs_detach_bo(struct xe_bo *bo)
> >  {
> >  	struct xe_device *xe = xe_bo_device(bo);
> >  	enum xe_sriov_vf_ccs_rw_ctxs ctx_id;
> > +	struct xe_tile *tile;
> >  	struct xe_bb *bb;
> >  
> >  	xe_assert(xe, IS_VF_CCS_READY(xe));
> > @@ -394,12 +395,14 @@ int xe_sriov_vf_ccs_detach_bo(struct xe_bo *bo)
> >  	if (!xe_bo_has_valid_ccs_bb(bo))
> >  		return 0;
> >  
> > +	tile = xe_device_get_root_tile(xe);
> > +
> >  	for_each_ccs_rw_ctx(ctx_id) {
> >  		bb = bo->bb_ccs[ctx_id];
> >  		if (!bb)
> >  			continue;
> >  
> > -		memset(bb->cs, MI_NOOP, bb->len * sizeof(u32));
> > +		xe_migrate_ccs_rw_copy_clear(tile, bo, ctx_id);
> >  		xe_bb_free(bb, NULL);
> >  		bo->bb_ccs[ctx_id] = NULL;
> >  	}
> > -- 
> > 2.51.0
> > 


* Re: [PATCH v4 1/3] drm/xe/migrate: Atomicize CCS copy command setup
  2025-10-06 19:58   ` Matthew Brost
@ 2025-10-08  9:50     ` K V P, Satyanarayana
  0 siblings, 0 replies; 9+ messages in thread
From: K V P, Satyanarayana @ 2025-10-08  9:50 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe, Michal Wajdeczko, Matthew Auld



On 07-10-2025 01:28, Matthew Brost wrote:
> On Mon, Oct 06, 2025 at 08:54:45PM +0530, Satyanarayana K V P wrote:
>> The CCS copy command is a 5-dword sequence. If the vCPU halts during
>> save/restore while this sequence is being programmed, partial writes may
>> trigger page faults when saving iGPU CCS metadata. Use the VMOVDQU
>> instruction to write the sequence atomically.
>>
>> Since the 256-bit form of VMOVDQU is used here, update EMIT_COPY_CCS_DW
>> to emit 8 dwords instead of 5.
>>
>> Update emit_flush_invalidate() to use VMOVDQU operating with 128-bit
>> chunks.
>>
>> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
>> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
>> Cc: Matthew Brost <matthew.brost@intel.com>
>> Cc: Matthew Auld <matthew.auld@intel.com>
>>
>> ---
>> V3 -> V4:
>> - Fixed review comments. (Wajdeczko)
>> - Fix issues reported by Patchwork.
>>
>> V2 -> V3:
>> - Added support for 128-bit and 256-bit instructions with memcpy_vmovdqu
>> - Updated emit_flush_invalidate() to use the vmovdqu instruction.
>>
>> V1 -> V2:
>> - Use memcpy_vmovdqu only for x86 arch and for VF. Else use memcpy
>>    (Auld, Matthew)
>> - Fix issues reported by Patchwork.
>> ---
>>   drivers/gpu/drm/xe/xe_migrate.c | 92 +++++++++++++++++++++++++--------
>>   1 file changed, 71 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
>> index c39c3b423d05..b960fdcecd88 100644
>> --- a/drivers/gpu/drm/xe/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/xe_migrate.c
>> @@ -5,7 +5,9 @@
>>   
>>   #include "xe_migrate.h"
>>   
>> +#include <asm/fpu/api.h>
>>   #include <linux/bitfield.h>
>> +#include <linux/cpufeature.h>
>>   #include <linux/sizes.h>
>>   
>>   #include <drm/drm_managed.h>
>> @@ -644,18 +646,50 @@ static void emit_pte(struct xe_migrate *m,
>>   	}
>>   }
>>   
>> -#define EMIT_COPY_CCS_DW 5
>> +static void memcpy_vmovdqu(void *dst, const void *src, u32 size)
>> +{
>> +	kernel_fpu_begin();
>> +
>> +#ifdef CONFIG_X86
>> +	if (size == SZ_128) {
>> +		asm("vmovdqu (%0), %%xmm0\n"
>> +		    "vmovups %%xmm0,   (%1)\n"
>> +		    :: "r" (src), "r" (dst) : "memory");
>> +	} else if (size == SZ_256) {
>> +		asm("vmovdqu (%0), %%ymm0\n"
>> +		    "vmovups %%ymm0,   (%1)\n"
>> +		    :: "r" (src), "r" (dst) : "memory");
>> +	}
>> +#endif
>> +	kernel_fpu_end();
> 
> I think you can hide this entire function behind #ifdef CONFIG_X86.
Kept the body of the function under #ifdef. Otherwise we may get a
compilation error when CONFIG_X86 is not defined.
>> +}
>> +
>> +static void emit_atomic(struct xe_gt *gt, void *dst, const void *src, u32 size)
>> +{
>> +	u32 instr_size = size * BITS_PER_BYTE;
>> +
>> +	xe_assert(gt_to_xe(gt), !(instr_size != SZ_128 && instr_size != SZ_256));
> 
> I think it is slightly more clear to write it like this:
> 
> xe_assert(gt_to_xe(gt), instr_size == SZ_128 || instr_size == SZ_256);
> 
> I suspect Michal would insist on xe_gt_assert here too.
Since CCS save/restore is per device, xe_assert() should hold good.
>> +
>> +	if (IS_SRIOV_VF(gt_to_xe(gt)) && static_cpu_has(X86_FEATURE_AVX))
> 
> Should this be VF CCS initialized check rather than generic VF check?
Fixed in new revision.
>> +		memcpy_vmovdqu(dst, src, instr_size);
>> +	else
>> +		memcpy(dst, src, size);
>> +}
>> +
>> +#define EMIT_COPY_CCS_DW 8
>>   static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
>>   			  u64 dst_ofs, bool dst_is_indirect,
>>   			  u64 src_ofs, bool src_is_indirect,
>>   			  u32 size)
>>   {
>> +	u32 dw[EMIT_COPY_CCS_DW] = {MI_NOOP};
>>   	struct xe_device *xe = gt_to_xe(gt);
>>   	u32 *cs = bb->cs + bb->len;
>>   	u32 num_ccs_blks;
>>   	u32 num_pages;
>>   	u32 ccs_copy_size;
>>   	u32 mocs;
>> +	u32 i = 0;
>>   
>>   	if (GRAPHICS_VERx100(xe) >= 2000) {
>>   		num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
>> @@ -673,15 +707,23 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
>>   		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
>>   	}
>>   
>> -	*cs++ = XY_CTRL_SURF_COPY_BLT |
>> -		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
>> -		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
>> -		ccs_copy_size;
>> -	*cs++ = lower_32_bits(src_ofs);
>> -	*cs++ = upper_32_bits(src_ofs) | mocs;
>> -	*cs++ = lower_32_bits(dst_ofs);
>> -	*cs++ = upper_32_bits(dst_ofs) | mocs;
>> +	dw[i++] = XY_CTRL_SURF_COPY_BLT |
>> +		  (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
>> +		  (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
>> +		  ccs_copy_size;
>> +	dw[i++] = lower_32_bits(src_ofs);
>> +	dw[i++] = upper_32_bits(src_ofs) | mocs;
>> +	dw[i++] = lower_32_bits(dst_ofs);
>> +	dw[i++] = upper_32_bits(dst_ofs) | mocs;
>>   
>> +	/*
>> +	 * The CCS copy command is a 5-dword sequence. If the vCPU halts during
>> +	 * save/restore while this sequence is being issued, partial writes may trigger
>> +	 * page faults when saving iGPU CCS metadata. Use the VMOVDQU instruction to
>> +	 * write the sequence atomically.
>> +	 */
>> +	emit_atomic(gt, cs, dw, sizeof(u32) * EMIT_COPY_CCS_DW);
> 
> Use sizeof(dw) to keep this consistent with below, or change below to
> match the logic here.
Fixed in new revision.
>> +	cs += EMIT_COPY_CCS_DW;
>>   	bb->len = cs - bb->cs;
>>   }
>>   
>> @@ -993,18 +1035,26 @@ static u64 migrate_vm_ppgtt_addr_tlb_inval(void)
>>   	return (NUM_KERNEL_PDE - 2) * XE_PAGE_SIZE;
>>   }
>>   
>> -static int emit_flush_invalidate(u32 *dw, int i, u32 flags)
>> +/*
>> + * The MI_FLUSH_DW command is a 4-dword sequence. If the vCPU halts during
>> + * save/restore while this sequence is being issued, partial writes may
>> + * trigger page faults when saving iGPU CCS metadata. Use
>> + * emit_atomic() to write the sequence atomically.
>> + */
>> +static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i, u32 flags)
> 
> s/dw/cs ?
Fixed in new revision.
>>   {
>>   	u64 addr = migrate_vm_ppgtt_addr_tlb_inval();
>> +	u32 tmp_dw[SZ_4] = {MI_NOOP}, j = 0;
> 
> #define EMIT_FLUSH_INVALIDATE_DW 4 ?
> 
> s/tmp_dw/cs ?
Fixed in new revision.
>> +
>> +	tmp_dw[j++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
>> +		      MI_FLUSH_IMM_DW | flags;
>> +	tmp_dw[j++] = lower_32_bits(addr);
>> +	tmp_dw[j++] = upper_32_bits(addr);
>> +	tmp_dw[j++] = MI_NOOP;
>>   
>> -	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
>> -		  MI_FLUSH_IMM_DW | flags;
>> -	dw[i++] = lower_32_bits(addr);
>> -	dw[i++] = upper_32_bits(addr);
>> -	dw[i++] = MI_NOOP;
>> -	dw[i++] = MI_NOOP;
>> +	emit_atomic(q->gt, &dw[i], tmp_dw, sizeof(tmp_dw));
>>   
>> -	return i;
>> +	return i + j;
>>   }
>>   
>>   /**
>> @@ -1049,7 +1099,7 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
>>   	/* Calculate Batch buffer size */
>>   	batch_size = 0;
>>   	while (size) {
>> -		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
>> +		batch_size += 8; /* Flush + ggtt addr + 1 NOP */
>>   		u64 ccs_ofs, ccs_size;
>>   		u32 ccs_pt;
>>   
>> @@ -1090,7 +1140,7 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
>>   	 * sizes here again before copy command is emitted.
>>   	 */
>>   	while (size) {
>> -		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
>> +		batch_size += 8; /* Flush + ggtt addr + 1 NOP */
> 
> EMIT_FLUSH_INVALIDATE_DW * 2 ?
> 
>>   		u32 flush_flags = 0;
>>   		u64 ccs_ofs, ccs_size;
>>   		u32 ccs_pt;
>> @@ -1113,11 +1163,11 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
>>   
>>   		emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
>>   
>> -		bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
>> +		bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);
>>   		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt,
>>   						  src_L0_ofs, dst_is_pltt,
>>   						  src_L0, ccs_ofs, true);
>> -		bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
>> +		bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);
> 
> Side note: I don't think the second emit_flush_invalidate is actually
> necessary here. Removing it is probably out of scope for this series,
> but once this is merged and testing is stable, we can try removing it in
> a follow-up and see what happens.
> 
> Matt
> 
>>   
>>   		size -= src_L0;
>>   	}
>> -- 
>> 2.51.0
>>



end of thread

Thread overview: 9+ messages
2025-10-06 15:24 [PATCH v4 0/3] drm/xe/migrate: Atomicize CCS copy command setup Satyanarayana K V P
2025-10-06 15:24 ` [PATCH v4 1/3] " Satyanarayana K V P
2025-10-06 19:58   ` Matthew Brost
2025-10-08  9:50     ` K V P, Satyanarayana
2025-10-06 15:24 ` [PATCH v4 2/3] drm/xe/migrate: Make emit_pte() header write atomic Satyanarayana K V P
2025-10-06 19:42   ` Matthew Brost
2025-10-06 15:24 ` [PATCH v4 3/3] drm/xe/vf: Clear CCS read/write buffers in atomic way Satyanarayana K V P
2025-10-06 20:12   ` Matthew Brost
2025-10-08  1:40     ` Matthew Brost
