[PATCH 0/2] Large devcoredump file support

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/2] Large devcoredump file support
@ 2025-04-01 20:43 Matthew Brost
  2025-04-01 20:43 ` [PATCH 1/2] drm/xe: Add devcoredump chunking Matthew Brost
                   ` (4 more replies)
  0 siblings, 5 replies; 7+ messages in thread
From: Matthew Brost @ 2025-04-01 20:43 UTC (permalink / raw)
  To: intel-xe

Devcoredumps were truncated at 2G; remove this restriction. While here,
add support for GPU copies of BOs to increase devcoredump speed.
 
Matthew Brost (2):
  drm/xe: Add devcoredump chunking
  drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access

 drivers/gpu/drm/xe/xe_bo.c                |  15 +-
 drivers/gpu/drm/xe/xe_devcoredump.c       |  59 ++++--
 drivers/gpu/drm/xe/xe_devcoredump_types.h |   2 +
 drivers/gpu/drm/xe/xe_guc_hwconfig.c      |   2 +-
 drivers/gpu/drm/xe/xe_migrate.c           | 219 ++++++++++++++++++++--
 drivers/gpu/drm/xe/xe_migrate.h           |   4 +
 6 files changed, 271 insertions(+), 30 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/2] drm/xe: Add devcoredump chunking
  2025-04-01 20:43 [PATCH 0/2] Large devcoredump file support Matthew Brost
@ 2025-04-01 20:43 ` Matthew Brost
  2025-04-01 20:59   ` Matthew Brost
  2025-04-01 20:43 ` [PATCH 2/2] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 7+ messages in thread
From: Matthew Brost @ 2025-04-01 20:43 UTC (permalink / raw)
  To: intel-xe

Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
of 2G. A simple algorithm reads 1.5G at a time in the xe_devcoredump_read
callback as needed.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_devcoredump.c       | 59 ++++++++++++++++++-----
 drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 +
 drivers/gpu/drm/xe/xe_guc_hwconfig.c      |  2 +-
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 81b9d9bb3f57..a9e618abf8ac 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
 	return &q->gt->uc.guc;
 }
 
-static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
+static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
+				     ssize_t start,
 				     struct xe_devcoredump *coredump)
 {
 	struct xe_device *xe;
@@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
 	ss = &coredump->snapshot;
 
 	iter.data = buffer;
-	iter.start = 0;
+	iter.start = start;
 	iter.remain = count;
 
 	p = drm_coredump_printer(&iter);
@@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
 	ss->vm = NULL;
 }
 
+#define XE_DEVCOREDUMP_CHUNK_MAX	(SZ_512M + SZ_1G)
+
 static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 				   size_t count, void *data, size_t datalen)
 {
@@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 	/* Ensure delayed work is captured before continuing */
 	flush_work(&ss->work);
 
+	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
+		xe_pm_runtime_get(gt_to_xe(ss->gt));
+
 	mutex_lock(&coredump->lock);
 
 	if (!ss->read.buffer) {
@@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 		return 0;
 	}
 
+	if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
+	    offset < ss->read.chunk_position) {
+		ss->read.chunk_position =
+			ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
+
+		__xe_devcoredump_read(ss->read.buffer,
+				      XE_DEVCOREDUMP_CHUNK_MAX,
+				      ss->read.chunk_position, coredump);
+	}
+
 	byte_copied = count < ss->read.size - offset ? count :
 		ss->read.size - offset;
-	memcpy(buffer, ss->read.buffer + offset, byte_copied);
+	memcpy(buffer, ss->read.buffer +
+	       (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
 
 	mutex_unlock(&coredump->lock);
 
+	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
+		xe_pm_runtime_put(gt_to_xe(ss->gt));
+
 	return byte_copied;
 }
 
@@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
 	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
 	xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
 
-	xe_pm_runtime_put(xe);
+	ss->read.chunk_position = 0;
 
 	/* Calculate devcoredump size */
-	ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
-
-	ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
-	if (!ss->read.buffer)
-		return;
+	ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
+
+	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
+		ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
+					   GFP_USER);
+		if (!ss->read.buffer)
+			goto put_pm;
+
+		__xe_devcoredump_read(ss->read.buffer,
+				      XE_DEVCOREDUMP_CHUNK_MAX,
+				      0, coredump);
+	} else {
+		ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
+		if (!ss->read.buffer)
+			goto put_pm;
+
+		__xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
+				      coredump);
+		xe_devcoredump_snapshot_free(ss);
+	}
 
-	__xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
-	xe_devcoredump_snapshot_free(ss);
+put_pm:
+	xe_pm_runtime_put(xe);
 }
 
 static void devcoredump_snapshot(struct xe_devcoredump *coredump,
@@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
 	if (offset & 3)
 		drm_printf(p, "Offset not word aligned: %zu", offset);
 
-	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
+	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
 	if (!line_buff) {
 		drm_printf(p, "Failed to allocate line buffer\n");
 		return;
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index 1a1d16a96b2d..a174385a6d83 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
 	struct {
 		/** @read.size: size of devcoredump in human readable format */
 		ssize_t size;
+		/** @read.chunk_position: position of devcoredump chunk */
+		ssize_t chunk_position;
 		/** @read.buffer: buffer of devcoredump in human readable format */
 		char *buffer;
 	} read;
diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
index af2c817d552c..21403a250834 100644
--- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
+++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
@@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
 	if (num_dw == 0)
 		return -EINVAL;
 
-	hwconfig = kzalloc(size, GFP_KERNEL);
+	hwconfig = kzalloc(size, GFP_ATOMIC);
 	if (!hwconfig)
 		return -ENOMEM;
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/2] drm/xe: Add devcoredump chunking
  2025-04-01 20:43 ` [PATCH 1/2] drm/xe: Add devcoredump chunking Matthew Brost
@ 2025-04-01 20:59   ` Matthew Brost
  0 siblings, 0 replies; 7+ messages in thread
From: Matthew Brost @ 2025-04-01 20:59 UTC (permalink / raw)
  To: intel-xe

On Tue, Apr 01, 2025 at 01:43:50PM -0700, Matthew Brost wrote:
> Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
> of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
> callback as needed.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Worth noting with this: we still have some O(N^2) characteristics within
the xe_devcoredump_read callback, so a large devcoredump, say 6G, takes
about 1.5 minutes to read out on BMG. We could move this to the snap worker
by allocating multiple buffers via kvmalloc - it would take the same amount
of time but would be in the worker. The readout would then be faster
once the worker completes. Easy to make this change if we decide on that
direction.

Matt

> ---
>  drivers/gpu/drm/xe/xe_devcoredump.c       | 59 ++++++++++++++++++-----
>  drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 +
>  drivers/gpu/drm/xe/xe_guc_hwconfig.c      |  2 +-
>  3 files changed, 50 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 81b9d9bb3f57..a9e618abf8ac 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
>  	return &q->gt->uc.guc;
>  }
>  
> -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
> +				     ssize_t start,
>  				     struct xe_devcoredump *coredump)
>  {
>  	struct xe_device *xe;
> @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>  	ss = &coredump->snapshot;
>  
>  	iter.data = buffer;
> -	iter.start = 0;
> +	iter.start = start;
>  	iter.remain = count;
>  
>  	p = drm_coredump_printer(&iter);
> @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
>  	ss->vm = NULL;
>  }
>  
> +#define XE_DEVCOREDUMP_CHUNK_MAX	(SZ_512M + SZ_1G)
> +
>  static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  				   size_t count, void *data, size_t datalen)
>  {
> @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  	/* Ensure delayed work is captured before continuing */
>  	flush_work(&ss->work);
>  
> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> +		xe_pm_runtime_get(gt_to_xe(ss->gt));
> +
>  	mutex_lock(&coredump->lock);
>  
>  	if (!ss->read.buffer) {
> @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  		return 0;
>  	}
>  
> +	if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
> +	    offset < ss->read.chunk_position) {
> +		ss->read.chunk_position =
> +			ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
> +
> +		__xe_devcoredump_read(ss->read.buffer,
> +				      XE_DEVCOREDUMP_CHUNK_MAX,
> +				      ss->read.chunk_position, coredump);
> +	}
> +
>  	byte_copied = count < ss->read.size - offset ? count :
>  		ss->read.size - offset;
> -	memcpy(buffer, ss->read.buffer + offset, byte_copied);
> +	memcpy(buffer, ss->read.buffer +
> +	       (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
>  
>  	mutex_unlock(&coredump->lock);
>  
> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> +		xe_pm_runtime_put(gt_to_xe(ss->gt));
> +
>  	return byte_copied;
>  }
>  
> @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
>  	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
>  	xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
>  
> -	xe_pm_runtime_put(xe);
> +	ss->read.chunk_position = 0;
>  
>  	/* Calculate devcoredump size */
> -	ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
> -
> -	ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> -	if (!ss->read.buffer)
> -		return;
> +	ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
> +
> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
> +		ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
> +					   GFP_USER);
> +		if (!ss->read.buffer)
> +			goto put_pm;
> +
> +		__xe_devcoredump_read(ss->read.buffer,
> +				      XE_DEVCOREDUMP_CHUNK_MAX,
> +				      0, coredump);
> +	} else {
> +		ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> +		if (!ss->read.buffer)
> +			goto put_pm;
> +
> +		__xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
> +				      coredump);
> +		xe_devcoredump_snapshot_free(ss);
> +	}
>  
> -	__xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
> -	xe_devcoredump_snapshot_free(ss);
> +put_pm:
> +	xe_pm_runtime_put(xe);
>  }
>  
>  static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
>  	if (offset & 3)
>  		drm_printf(p, "Offset not word aligned: %zu", offset);
>  
> -	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
> +	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
>  	if (!line_buff) {
>  		drm_printf(p, "Failed to allocate line buffer\n");
>  		return;
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 1a1d16a96b2d..a174385a6d83 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
>  	struct {
>  		/** @read.size: size of devcoredump in human readable format */
>  		ssize_t size;
> +		/** @read.chunk_position: position of devcoredump chunk */
> +		ssize_t chunk_position;
>  		/** @read.buffer: buffer of devcoredump in human readable format */
>  		char *buffer;
>  	} read;
> diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> index af2c817d552c..21403a250834 100644
> --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
>  	if (num_dw == 0)
>  		return -EINVAL;
>  
> -	hwconfig = kzalloc(size, GFP_KERNEL);
> +	hwconfig = kzalloc(size, GFP_ATOMIC);
>  	if (!hwconfig)
>  		return -ENOMEM;
>  
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 2/2] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
  2025-04-01 20:43 [PATCH 0/2] Large devcoredump file support Matthew Brost
  2025-04-01 20:43 ` [PATCH 1/2] drm/xe: Add devcoredump chunking Matthew Brost
@ 2025-04-01 20:43 ` Matthew Brost
  2025-04-01 20:47 ` ✓ CI.Patch_applied: success for Large devcoredump file support Patchwork
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Matthew Brost @ 2025-04-01 20:43 UTC (permalink / raw)
  To: intel-xe

Add migrate layer functions to access VRAM and update
xe_ttm_access_memory to use them for non-visible access and large (16k
or more) BO access. An 8G devcoredump on BMG observed a 3 minute CPU copy
time vs. a 3s GPU copy time.

v4:
 - Fix non-page aligned accesses
 - Add support for small / unaligned access
 - Update commit message indicating migrate used for large accesses (Auld)
 - Fix warning in xe_res_cursor for non-zero offset
v5:
 - Fix 32 bit build (CI)
v6:
 - Rebase and use SVM migration copy functions

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_bo.c      |  15 ++-
 drivers/gpu/drm/xe/xe_migrate.c | 219 ++++++++++++++++++++++++++++++--
 drivers/gpu/drm/xe/xe_migrate.h |   4 +
 3 files changed, 221 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 3c7c2353d3c8..c7e6b03d4aef 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -1414,6 +1414,7 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
 	struct xe_res_cursor cursor;
 	struct xe_vram_region *vram;
 	int bytes_left = len;
+	int err = 0;
 
 	xe_bo_assert_held(bo);
 	xe_device_assert_mem_access(xe);
@@ -1421,9 +1422,14 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
 	if (!mem_type_is_vram(ttm_bo->resource->mem_type))
 		return -EIO;
 
-	/* FIXME: Use GPU for non-visible VRAM */
-	if (!xe_ttm_resource_visible(ttm_bo->resource))
-		return -EIO;
+	if (!xe_ttm_resource_visible(ttm_bo->resource) || len >= SZ_16K) {
+		struct xe_migrate *migrate =
+			mem_type_to_migrate(xe, ttm_bo->resource->mem_type);
+
+		err = xe_migrate_access_memory(migrate, bo, offset, buf, len,
+					       write);
+		goto out;
+	}
 
 	vram = res_to_mem_region(ttm_bo->resource);
 	xe_res_first(ttm_bo->resource, offset & PAGE_MASK,
@@ -1447,7 +1453,8 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
 			xe_res_next(&cursor, PAGE_SIZE);
 	} while (bytes_left);
 
-	return len;
+out:
+	return err ?: len;
 }
 
 const struct ttm_device_funcs xe_ttm_funcs = {
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index ff0fc2fb0eb9..f1f28564dbda 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -670,6 +670,7 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
 	u32 mocs = 0;
 	u32 tile_y = 0;
 
+	xe_gt_assert(gt, !(pitch & 3));
 	xe_gt_assert(gt, size / pitch <= S16_MAX);
 	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
 	xe_gt_assert(gt, pitch <= U16_MAX);
@@ -1602,8 +1603,12 @@ enum xe_migrate_copy_dir {
 	XE_MIGRATE_COPY_TO_SRAM,
 };
 
+#define CACHELINE_BYTES	64ull
+#define CACHELINE_MASK	(CACHELINE_BYTES - 1)
+
 static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
-					 unsigned long npages,
+					 unsigned long len,
+					 unsigned long sram_offset,
 					 dma_addr_t *sram_addr, u64 vram_addr,
 					 const enum xe_migrate_copy_dir dir)
 {
@@ -1613,17 +1618,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
 	struct dma_fence *fence = NULL;
 	u32 batch_size = 2;
 	u64 src_L0_ofs, dst_L0_ofs;
-	u64 round_update_size;
 	struct xe_sched_job *job;
 	struct xe_bb *bb;
 	u32 update_idx, pt_slot = 0;
+	unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
+	unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
+		PAGE_SIZE : 4;
 	int err;
 
-	if (npages * PAGE_SIZE > MAX_PREEMPTDISABLE_TRANSFER)
-		return ERR_PTR(-EINVAL);
+	if (drm_WARN_ON(&xe->drm, (len & CACHELINE_MASK) ||
+			(sram_offset | vram_addr) & CACHELINE_MASK))
+		return ERR_PTR(-EOPNOTSUPP);
 
-	round_update_size = npages * PAGE_SIZE;
-	batch_size += pte_update_cmd_size(round_update_size);
+	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);
+
+	batch_size += pte_update_cmd_size(len);
 	batch_size += EMIT_COPY_DW;
 
 	bb = xe_bb_new(gt, batch_size, use_usm_batch);
@@ -1633,22 +1642,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
 	}
 
 	build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
-				   sram_addr, round_update_size);
+				   sram_addr, len + sram_offset);
 
 	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
-		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
 		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
 
 	} else {
 		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
-		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
 	}
 
 	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
 	update_idx = bb->len;
 
-	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
-		  XE_PAGE_SIZE);
+	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
 
 	job = xe_bb_create_migration_job(m->q, bb,
 					 xe_migrate_batch_base(m, use_usm_batch),
@@ -1696,7 +1704,7 @@ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
 				     dma_addr_t *src_addr,
 				     u64 dst_addr)
 {
-	return xe_migrate_vram(m, npages, src_addr, dst_addr,
+	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
 			       XE_MIGRATE_COPY_TO_VRAM);
 }
 
@@ -1717,12 +1725,197 @@ struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
 				       u64 src_addr,
 				       dma_addr_t *dst_addr)
 {
-	return xe_migrate_vram(m, npages, dst_addr, src_addr,
+	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
 			       XE_MIGRATE_COPY_TO_SRAM);
 }
 
 #endif
 
+static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
+				 int len, int write)
+{
+	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+	for (i = 0; i < npages; ++i) {
+		if (!dma_addr[i])
+			continue;
+
+		dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
+			       write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	}
+	kfree(dma_addr);
+}
+
+static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe,
+				      void *buf, int len, int write)
+{
+	dma_addr_t *dma_addr;
+	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+	dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL);
+	if (!dma_addr)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < npages; ++i) {
+		dma_addr_t addr;
+		struct page *page;
+
+		if (is_vmalloc_addr(buf))
+			page = vmalloc_to_page(buf);
+		else
+			page = virt_to_page(buf);
+
+		addr = dma_map_page(xe->drm.dev,
+				    page, 0, PAGE_SIZE,
+				    write ? DMA_TO_DEVICE :
+				    DMA_FROM_DEVICE);
+		if (dma_mapping_error(xe->drm.dev, addr))
+			goto err_fault;
+
+		dma_addr[i] = addr;
+		buf += PAGE_SIZE;
+	}
+
+	return dma_addr;
+
+err_fault:
+	xe_migrate_dma_unmap(xe, dma_addr, len, write);
+	return ERR_PTR(-EFAULT);
+}
+
+/**
+ * xe_migrate_access_memory - Access memory of a BO via GPU
+ *
+ * @m: The migration context.
+ * @bo: buffer object
+ * @offset: access offset into buffer object
+ * @buf: pointer to caller memory to read into or write from
+ * @len: length of access
+ * @write: write access
+ *
+ * Access memory of a BO via GPU either reading in or writing from a passed in
+ * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
+ * read to or write from pointer.
+ *
+ * Returns:
+ * 0 if successful, negative error code on failure.
+ */
+int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
+			     unsigned long offset, void *buf, int len,
+			     int write)
+{
+	struct xe_tile *tile = m->tile;
+	struct xe_device *xe = tile_to_xe(tile);
+	struct xe_res_cursor cursor;
+	struct dma_fence *fence = NULL;
+	dma_addr_t *dma_addr;
+	unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
+	int bytes_left = len, current_page = 0;
+	void *orig_buf = buf;
+
+	xe_bo_assert_held(bo);
+
+	/* Use bounce buffer for small access and unaligned access */
+	if (len & CACHELINE_MASK || ((uintptr_t)buf | offset) & CACHELINE_MASK) {
+		int buf_offset = 0;
+
+		/*
+		 * Less than ideal for large unaligned access but this should be
+		 * fairly rare, can fixup if this becomes common.
+		 */
+		do {
+			u8 bounce[CACHELINE_BYTES];
+			void *ptr = (void *)bounce;
+			int err;
+			int copy_bytes = min_t(int, bytes_left,
+					       CACHELINE_BYTES -
+					       (offset & CACHELINE_MASK));
+			int ptr_offset = offset & CACHELINE_MASK;
+
+			err = xe_migrate_access_memory(m, bo,
+						       offset & ~CACHELINE_MASK,
+						       (void *)ptr,
+						       sizeof(bounce), 0);
+			if (err)
+				return err;
+
+			if (!write) {
+				memcpy(buf + buf_offset, ptr + ptr_offset,
+				       copy_bytes);
+				goto next;
+			}
+
+			memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
+			err = xe_migrate_access_memory(m, bo,
+						       offset & ~CACHELINE_MASK,
+						       (void *)ptr,
+						       sizeof(bounce), 0);
+			if (err)
+				return err;
+
+next:
+			bytes_left -= copy_bytes;
+			buf_offset += copy_bytes;
+			offset += copy_bytes;
+		} while (bytes_left);
+
+		return 0;
+	}
+
+	dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
+	if (IS_ERR(dma_addr))
+		return PTR_ERR(dma_addr);
+
+	xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
+
+	do {
+		struct dma_fence *__fence;
+		u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
+			cursor.start;
+		int current_bytes;
+
+		if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
+			current_bytes = min_t(int, bytes_left,
+					      MAX_PREEMPTDISABLE_TRANSFER);
+		else
+			current_bytes = min_t(int, bytes_left, cursor.size);
+
+		if (fence)
+			dma_fence_put(fence);
+
+		__fence = xe_migrate_vram(m, current_bytes,
+					  (unsigned long)buf & ~PAGE_MASK,
+					  dma_addr + current_page,
+					  vram_addr, write ?
+					  XE_MIGRATE_COPY_TO_VRAM :
+					  XE_MIGRATE_COPY_TO_SRAM);
+		if (IS_ERR(__fence)) {
+			if (fence)
+				dma_fence_wait(fence, false);
+			fence = __fence;
+			goto out_err;
+		}
+		fence = __fence;
+
+		buf += current_bytes;
+		offset += current_bytes;
+		current_page = (int)(buf - orig_buf) / PAGE_SIZE;
+		bytes_left -= current_bytes;
+		if (bytes_left)
+			xe_res_next(&cursor, current_bytes);
+	} while (bytes_left);
+
+	dma_fence_wait(fence, false);
+	dma_fence_put(fence);
+	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
+
+	return 0;
+
+out_err:
+	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
+	return PTR_ERR(fence);
+}
+
 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
 #include "tests/xe_migrate.c"
 #endif
diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
index 6ff9a963425c..fb9839c1bae0 100644
--- a/drivers/gpu/drm/xe/xe_migrate.h
+++ b/drivers/gpu/drm/xe/xe_migrate.h
@@ -112,6 +112,10 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 				  struct ttm_resource *dst,
 				  bool copy_only_ccs);
 
+int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
+			     unsigned long offset, void *buf, int len,
+			     int write);
+
 #define XE_MIGRATE_CLEAR_FLAG_BO_DATA		BIT(0)
 #define XE_MIGRATE_CLEAR_FLAG_CCS_DATA		BIT(1)
 #define XE_MIGRATE_CLEAR_FLAG_FULL	(XE_MIGRATE_CLEAR_FLAG_BO_DATA | \
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* ✓ CI.Patch_applied: success for Large devcoredump file support
  2025-04-01 20:43 [PATCH 0/2] Large devcoredump file support Matthew Brost
  2025-04-01 20:43 ` [PATCH 1/2] drm/xe: Add devcoredump chunking Matthew Brost
  2025-04-01 20:43 ` [PATCH 2/2] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
@ 2025-04-01 20:47 ` Patchwork
  2025-04-01 20:48 ` ✓ CI.checkpatch: " Patchwork
  2025-04-01 20:48 ` ✗ CI.KUnit: failure " Patchwork
  4 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2025-04-01 20:47 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe

== Series Details ==

Series: Large devcoredump file support
URL   : https://patchwork.freedesktop.org/series/147085/
State : success

== Summary ==

=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: 2dda6162c8d9 drm-tip: 2025y-04m-01d-20h-22m-36s UTC integration manifest
=== git am output follows ===
Applying: drm/xe: Add devcoredump chunking
Applying: drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access



^ permalink raw reply	[flat|nested] 7+ messages in thread

* ✓ CI.checkpatch: success for Large devcoredump file support
  2025-04-01 20:43 [PATCH 0/2] Large devcoredump file support Matthew Brost
                   ` (2 preceding siblings ...)
  2025-04-01 20:47 ` ✓ CI.Patch_applied: success for Large devcoredump file support Patchwork
@ 2025-04-01 20:48 ` Patchwork
  2025-04-01 20:48 ` ✗ CI.KUnit: failure " Patchwork
  4 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2025-04-01 20:48 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe

== Series Details ==

Series: Large devcoredump file support
URL   : https://patchwork.freedesktop.org/series/147085/
State : success

== Summary ==

+ KERNEL=/kernel
+ git clone https://gitlab.freedesktop.org/drm/maintainer-tools mt
Cloning into 'mt'...
warning: redirecting to https://gitlab.freedesktop.org/drm/maintainer-tools.git/
+ git -C mt rev-list -n1 origin/master
99e5a866b5e13f134e606a3e29d9508d97826fb3
+ cd /kernel
+ git config --global --add safe.directory /kernel
+ git log -n1
commit b1b8be18d6473f947deba912f8b7f3f461a1de06
Author: Matthew Brost <matthew.brost@intel.com>
Date:   Tue Apr 1 13:43:51 2025 -0700

    drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
    
    Add migrate layer functions to access VRAM and update
    xe_ttm_access_memory to use for non-visible access and large (more than
    16k) BO access. 8G devcoreump on BMG observed 3 minute CPU copy time vs.
    3s GPU copy time.
    
    v4:
     - Fix non-page aligned accesses
     - Add support for small / unaligned access
     - Update commit message indicating migrate used for large accesses (Auld)
     - Fix warning in xe_res_cursor for non-zero offset
    v5:
     - Fix 32 bit build (CI)
    v6:
     - Rebase and use SVM migration copy functions
    
    Signed-off-by: Matthew Brost <matthew.brost@intel.com>
+ /mt/dim checkpatch 2dda6162c8d942406173d92a4cd3f23ea8a45696 drm-intel
054b7afbea46 drm/xe: Add devcoredump chunking
b1b8be18d647 drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access



^ permalink raw reply	[flat|nested] 7+ messages in thread

* ✗ CI.KUnit: failure for Large devcoredump file support
  2025-04-01 20:43 [PATCH 0/2] Large devcoredump file support Matthew Brost
                   ` (3 preceding siblings ...)
  2025-04-01 20:48 ` ✓ CI.checkpatch: " Patchwork
@ 2025-04-01 20:48 ` Patchwork
  4 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2025-04-01 20:48 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe

== Series Details ==

Series: Large devcoredump file support
URL   : https://patchwork.freedesktop.org/series/147085/
State : failure

== Summary ==

+ trap cleanup EXIT
+ /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
ERROR:root:../drivers/gpu/drm/xe/xe_migrate.c: In function ‘xe_migrate_access_memory’:
../drivers/gpu/drm/xe/xe_migrate.c:1819:19: error: ‘CACHELINE_MASK’ undeclared (first use in this function)
 1819 |         if (len & CACHELINE_MASK || ((uintptr_t)buf | offset) & CACHELINE_MASK) {
      |                   ^~~~~~~~~~~~~~
../drivers/gpu/drm/xe/xe_migrate.c:1819:19: note: each undeclared identifier is reported only once for each function it appears in
../drivers/gpu/drm/xe/xe_migrate.c:1827:35: error: ‘CACHELINE_BYTES’ undeclared (first use in this function)
 1827 |                         u8 bounce[CACHELINE_BYTES];
      |                                   ^~~~~~~~~~~~~~~
../drivers/gpu/drm/xe/xe_migrate.c:1827:28: warning: unused variable ‘bounce’ [-Wunused-variable]
 1827 |                         u8 bounce[CACHELINE_BYTES];
      |                            ^~~~~~
../drivers/gpu/drm/xe/xe_migrate.c:1886:27: error: implicit declaration of function ‘xe_migrate_vram’; did you mean ‘xe_migrate_to_vram’? [-Werror=implicit-function-declaration]
 1886 |                 __fence = xe_migrate_vram(m, current_bytes,
      |                           ^~~~~~~~~~~~~~~
      |                           xe_migrate_to_vram
../drivers/gpu/drm/xe/xe_migrate.c:1890:43: error: ‘XE_MIGRATE_COPY_TO_VRAM’ undeclared (first use in this function)
 1890 |                                           XE_MIGRATE_COPY_TO_VRAM :
      |                                           ^~~~~~~~~~~~~~~~~~~~~~~
../drivers/gpu/drm/xe/xe_migrate.c:1891:43: error: ‘XE_MIGRATE_COPY_TO_SRAM’ undeclared (first use in this function)
 1891 |                                           XE_MIGRATE_COPY_TO_SRAM);
      |                                           ^~~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
make[7]: *** [../scripts/Makefile.build:207: drivers/gpu/drm/xe/xe_migrate.o] Error 1
make[7]: *** Waiting for unfinished jobs....
../lib/iomap.c:156:5: warning: no previous prototype for ‘ioread64_lo_hi’ [-Wmissing-prototypes]
  156 | u64 ioread64_lo_hi(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~
../lib/iomap.c:163:5: warning: no previous prototype for ‘ioread64_hi_lo’ [-Wmissing-prototypes]
  163 | u64 ioread64_hi_lo(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~
../lib/iomap.c:170:5: warning: no previous prototype for ‘ioread64be_lo_hi’ [-Wmissing-prototypes]
  170 | u64 ioread64be_lo_hi(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~~~
../lib/iomap.c:178:5: warning: no previous prototype for ‘ioread64be_hi_lo’ [-Wmissing-prototypes]
  178 | u64 ioread64be_hi_lo(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~~~
../lib/iomap.c:264:6: warning: no previous prototype for ‘iowrite64_lo_hi’ [-Wmissing-prototypes]
  264 | void iowrite64_lo_hi(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~
../lib/iomap.c:272:6: warning: no previous prototype for ‘iowrite64_hi_lo’ [-Wmissing-prototypes]
  272 | void iowrite64_hi_lo(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~
../lib/iomap.c:280:6: warning: no previous prototype for ‘iowrite64be_lo_hi’ [-Wmissing-prototypes]
  280 | void iowrite64be_lo_hi(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~~~
../lib/iomap.c:288:6: warning: no previous prototype for ‘iowrite64be_hi_lo’ [-Wmissing-prototypes]
  288 | void iowrite64be_hi_lo(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~~~
make[6]: *** [../scripts/Makefile.build:465: drivers/gpu/drm/xe] Error 2
make[6]: *** Waiting for unfinished jobs....
make[5]: *** [../scripts/Makefile.build:465: drivers/gpu/drm] Error 2
make[4]: *** [../scripts/Makefile.build:465: drivers/gpu] Error 2
make[3]: *** [../scripts/Makefile.build:465: drivers] Error 2
make[2]: *** [/kernel/Makefile:1994: .] Error 2
make[1]: *** [/kernel/Makefile:251: __sub-make] Error 2
make: *** [Makefile:251: __sub-make] Error 2

[20:48:05] Configuring KUnit Kernel ...
Generating .config ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
[20:48:09] Building KUnit Kernel ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
Building with:
$ make all compile_commands.json ARCH=um O=.kunit --jobs=48
+ cleanup
++ stat -c %u:%g /kernel
+ chown -R 1003:1003 /kernel



^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-04-01 20:58 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-01 20:43 [PATCH 0/2] Large devcoredump file support Matthew Brost
2025-04-01 20:43 ` [PATCH 1/2] drm/xe: Add devcoredump chunking Matthew Brost
2025-04-01 20:59   ` Matthew Brost
2025-04-01 20:43 ` [PATCH 2/2] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
2025-04-01 20:47 ` ✓ CI.Patch_applied: success for Large devcoredump file support Patchwork
2025-04-01 20:48 ` ✓ CI.checkpatch: " Patchwork
2025-04-01 20:48 ` ✗ CI.KUnit: failure " Patchwork

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox