[PATCH v2 0/4] Large devcoredump file support

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 0/4] Large devcoredump file support
@ 2025-04-03 20:27 Matthew Brost
  2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
                   ` (6 more replies)
  0 siblings, 7 replies; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 20:27 UTC (permalink / raw)
  To: intel-xe; +Cc: dri-devel

Devcoredump were truncated at 2G, remove this restriction. While here,
add support for GPU copies of BOs to increase devcoredump speed.

v2:
 - Fix build error
 - Abort printing once printer if full

Matthew Brost (4):
  drm/xe: Add devcoredump chunking
  drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
  drm/print: Add drm_printer_is_full
  drm/xe: Abort printing coredump in VM printer output if full

 drivers/gpu/drm/xe/xe_bo.c                |  15 +-
 drivers/gpu/drm/xe/xe_devcoredump.c       |  59 ++++--
 drivers/gpu/drm/xe/xe_devcoredump_types.h |   2 +
 drivers/gpu/drm/xe/xe_guc_hwconfig.c      |   2 +-
 drivers/gpu/drm/xe/xe_migrate.c           | 221 ++++++++++++++++++++--
 drivers/gpu/drm/xe/xe_migrate.h           |   4 +
 drivers/gpu/drm/xe/xe_vm.c                |   3 +
 include/drm/drm_print.h                   |  17 ++
 8 files changed, 293 insertions(+), 30 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v2 1/4] drm/xe: Add devcoredump chunking
  2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
@ 2025-04-03 20:27 ` Matthew Brost
  2025-04-03 21:28   ` Cavitt, Jonathan
  2025-04-03 21:39   ` John Harrison
  2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
                   ` (5 subsequent siblings)
  6 siblings, 2 replies; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 20:27 UTC (permalink / raw)
  To: intel-xe; +Cc: dri-devel

Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
callback as needed.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_devcoredump.c       | 59 ++++++++++++++++++-----
 drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 +
 drivers/gpu/drm/xe/xe_guc_hwconfig.c      |  2 +-
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 81b9d9bb3f57..a9e618abf8ac 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
 	return &q->gt->uc.guc;
 }
 
-static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
+static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
+				     ssize_t start,
 				     struct xe_devcoredump *coredump)
 {
 	struct xe_device *xe;
@@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
 	ss = &coredump->snapshot;
 
 	iter.data = buffer;
-	iter.start = 0;
+	iter.start = start;
 	iter.remain = count;
 
 	p = drm_coredump_printer(&iter);
@@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
 	ss->vm = NULL;
 }
 
+#define XE_DEVCOREDUMP_CHUNK_MAX	(SZ_512M + SZ_1G)
+
 static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 				   size_t count, void *data, size_t datalen)
 {
@@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 	/* Ensure delayed work is captured before continuing */
 	flush_work(&ss->work);
 
+	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
+		xe_pm_runtime_get(gt_to_xe(ss->gt));
+
 	mutex_lock(&coredump->lock);
 
 	if (!ss->read.buffer) {
@@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 		return 0;
 	}
 
+	if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
+	    offset < ss->read.chunk_position) {
+		ss->read.chunk_position =
+			ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
+
+		__xe_devcoredump_read(ss->read.buffer,
+				      XE_DEVCOREDUMP_CHUNK_MAX,
+				      ss->read.chunk_position, coredump);
+	}
+
 	byte_copied = count < ss->read.size - offset ? count :
 		ss->read.size - offset;
-	memcpy(buffer, ss->read.buffer + offset, byte_copied);
+	memcpy(buffer, ss->read.buffer +
+	       (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
 
 	mutex_unlock(&coredump->lock);
 
+	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
+		xe_pm_runtime_put(gt_to_xe(ss->gt));
+
 	return byte_copied;
 }
 
@@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
 	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
 	xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
 
-	xe_pm_runtime_put(xe);
+	ss->read.chunk_position = 0;
 
 	/* Calculate devcoredump size */
-	ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
-
-	ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
-	if (!ss->read.buffer)
-		return;
+	ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
+
+	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
+		ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
+					   GFP_USER);
+		if (!ss->read.buffer)
+			goto put_pm;
+
+		__xe_devcoredump_read(ss->read.buffer,
+				      XE_DEVCOREDUMP_CHUNK_MAX,
+				      0, coredump);
+	} else {
+		ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
+		if (!ss->read.buffer)
+			goto put_pm;
+
+		__xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
+				      coredump);
+		xe_devcoredump_snapshot_free(ss);
+	}
 
-	__xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
-	xe_devcoredump_snapshot_free(ss);
+put_pm:
+	xe_pm_runtime_put(xe);
 }
 
 static void devcoredump_snapshot(struct xe_devcoredump *coredump,
@@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
 	if (offset & 3)
 		drm_printf(p, "Offset not word aligned: %zu", offset);
 
-	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
+	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
 	if (!line_buff) {
 		drm_printf(p, "Failed to allocate line buffer\n");
 		return;
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index 1a1d16a96b2d..a174385a6d83 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
 	struct {
 		/** @read.size: size of devcoredump in human readable format */
 		ssize_t size;
+		/** @read.chunk_position: position of devcoredump chunk */
+		ssize_t chunk_position;
 		/** @read.buffer: buffer of devcoredump in human readable format */
 		char *buffer;
 	} read;
diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
index af2c817d552c..21403a250834 100644
--- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
+++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
@@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
 	if (num_dw == 0)
 		return -EINVAL;
 
-	hwconfig = kzalloc(size, GFP_KERNEL);
+	hwconfig = kzalloc(size, GFP_ATOMIC);
 	if (!hwconfig)
 		return -ENOMEM;
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
  2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
  2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
@ 2025-04-03 20:27 ` Matthew Brost
  2025-04-03 21:28   ` Cavitt, Jonathan
                     ` (3 more replies)
  2025-04-03 20:27 ` [PATCH v2 3/4] drm/print: Add drm_printer_is_full Matthew Brost
                   ` (4 subsequent siblings)
  6 siblings, 4 replies; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 20:27 UTC (permalink / raw)
  To: intel-xe; +Cc: dri-devel

Add migrate layer functions to access VRAM and update
xe_ttm_access_memory to use for non-visible access and large (more than
16k) BO access. 8G devcoreump on BMG observed 3 minute CPU copy time vs.
3s GPU copy time.

v4:
 - Fix non-page aligned accesses
 - Add support for small / unaligned access
 - Update commit message indicating migrate used for large accesses (Auld)
 - Fix warning in xe_res_cursor for non-zero offset
v5:
 - Fix 32 bit build (CI)
v6:
 - Rebase and use SVM migration copy functions
v7:
 - Fix build error (CI)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_bo.c      |  15 ++-
 drivers/gpu/drm/xe/xe_migrate.c | 221 ++++++++++++++++++++++++++++++--
 drivers/gpu/drm/xe/xe_migrate.h |   4 +
 3 files changed, 223 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 3c7c2353d3c8..c7e6b03d4aef 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -1414,6 +1414,7 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
 	struct xe_res_cursor cursor;
 	struct xe_vram_region *vram;
 	int bytes_left = len;
+	int err = 0;
 
 	xe_bo_assert_held(bo);
 	xe_device_assert_mem_access(xe);
@@ -1421,9 +1422,14 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
 	if (!mem_type_is_vram(ttm_bo->resource->mem_type))
 		return -EIO;
 
-	/* FIXME: Use GPU for non-visible VRAM */
-	if (!xe_ttm_resource_visible(ttm_bo->resource))
-		return -EIO;
+	if (!xe_ttm_resource_visible(ttm_bo->resource) || len >= SZ_16K) {
+		struct xe_migrate *migrate =
+			mem_type_to_migrate(xe, ttm_bo->resource->mem_type);
+
+		err = xe_migrate_access_memory(migrate, bo, offset, buf, len,
+					       write);
+		goto out;
+	}
 
 	vram = res_to_mem_region(ttm_bo->resource);
 	xe_res_first(ttm_bo->resource, offset & PAGE_MASK,
@@ -1447,7 +1453,8 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
 			xe_res_next(&cursor, PAGE_SIZE);
 	} while (bytes_left);
 
-	return len;
+out:
+	return err ?: len;
 }
 
 const struct ttm_device_funcs xe_ttm_funcs = {
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index ff0fc2fb0eb9..ba8568691d99 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -670,6 +670,7 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
 	u32 mocs = 0;
 	u32 tile_y = 0;
 
+	xe_gt_assert(gt, !(pitch & 3));
 	xe_gt_assert(gt, size / pitch <= S16_MAX);
 	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
 	xe_gt_assert(gt, pitch <= U16_MAX);
@@ -1602,8 +1603,12 @@ enum xe_migrate_copy_dir {
 	XE_MIGRATE_COPY_TO_SRAM,
 };
 
+#define XE_CACHELINE_BYTES	64ull
+#define XE_CACHELINE_MASK	(XE_CACHELINE_BYTES - 1)
+
 static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
-					 unsigned long npages,
+					 unsigned long len,
+					 unsigned long sram_offset,
 					 dma_addr_t *sram_addr, u64 vram_addr,
 					 const enum xe_migrate_copy_dir dir)
 {
@@ -1613,17 +1618,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
 	struct dma_fence *fence = NULL;
 	u32 batch_size = 2;
 	u64 src_L0_ofs, dst_L0_ofs;
-	u64 round_update_size;
 	struct xe_sched_job *job;
 	struct xe_bb *bb;
 	u32 update_idx, pt_slot = 0;
+	unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
+	unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
+		PAGE_SIZE : 4;
 	int err;
 
-	if (npages * PAGE_SIZE > MAX_PREEMPTDISABLE_TRANSFER)
-		return ERR_PTR(-EINVAL);
+	if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
+			(sram_offset | vram_addr) & XE_CACHELINE_MASK))
+		return ERR_PTR(-EOPNOTSUPP);
 
-	round_update_size = npages * PAGE_SIZE;
-	batch_size += pte_update_cmd_size(round_update_size);
+	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);
+
+	batch_size += pte_update_cmd_size(len);
 	batch_size += EMIT_COPY_DW;
 
 	bb = xe_bb_new(gt, batch_size, use_usm_batch);
@@ -1633,22 +1642,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
 	}
 
 	build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
-				   sram_addr, round_update_size);
+				   sram_addr, len + sram_offset);
 
 	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
-		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
 		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
 
 	} else {
 		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
-		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
 	}
 
 	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
 	update_idx = bb->len;
 
-	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
-		  XE_PAGE_SIZE);
+	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
 
 	job = xe_bb_create_migration_job(m->q, bb,
 					 xe_migrate_batch_base(m, use_usm_batch),
@@ -1696,7 +1704,7 @@ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
 				     dma_addr_t *src_addr,
 				     u64 dst_addr)
 {
-	return xe_migrate_vram(m, npages, src_addr, dst_addr,
+	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
 			       XE_MIGRATE_COPY_TO_VRAM);
 }
 
@@ -1717,12 +1725,199 @@ struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
 				       u64 src_addr,
 				       dma_addr_t *dst_addr)
 {
-	return xe_migrate_vram(m, npages, dst_addr, src_addr,
+	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
 			       XE_MIGRATE_COPY_TO_SRAM);
 }
 
 #endif
 
+static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
+				 int len, int write)
+{
+	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+	for (i = 0; i < npages; ++i) {
+		if (!dma_addr[i])
+			continue;
+
+		dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
+			       write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	}
+	kfree(dma_addr);
+}
+
+static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe,
+				      void *buf, int len, int write)
+{
+	dma_addr_t *dma_addr;
+	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+	dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL);
+	if (!dma_addr)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < npages; ++i) {
+		dma_addr_t addr;
+		struct page *page;
+
+		if (is_vmalloc_addr(buf))
+			page = vmalloc_to_page(buf);
+		else
+			page = virt_to_page(buf);
+
+		addr = dma_map_page(xe->drm.dev,
+				    page, 0, PAGE_SIZE,
+				    write ? DMA_TO_DEVICE :
+				    DMA_FROM_DEVICE);
+		if (dma_mapping_error(xe->drm.dev, addr))
+			goto err_fault;
+
+		dma_addr[i] = addr;
+		buf += PAGE_SIZE;
+	}
+
+	return dma_addr;
+
+err_fault:
+	xe_migrate_dma_unmap(xe, dma_addr, len, write);
+	return ERR_PTR(-EFAULT);
+}
+
+/**
+ * xe_migrate_access_memory - Access memory of a BO via GPU
+ *
+ * @m: The migration context.
+ * @bo: buffer object
+ * @offset: access offset into buffer object
+ * @buf: pointer to caller memory to read into or write from
+ * @len: length of access
+ * @write: write access
+ *
+ * Access memory of a BO via GPU either reading in or writing from a passed in
+ * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
+ * read to or write from pointer.
+ *
+ * Returns:
+ * 0 if successful, negative error code on failure.
+ */
+int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
+			     unsigned long offset, void *buf, int len,
+			     int write)
+{
+	struct xe_tile *tile = m->tile;
+	struct xe_device *xe = tile_to_xe(tile);
+	struct xe_res_cursor cursor;
+	struct dma_fence *fence = NULL;
+	dma_addr_t *dma_addr;
+	unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
+	int bytes_left = len, current_page = 0;
+	void *orig_buf = buf;
+
+	xe_bo_assert_held(bo);
+
+	/* Use bounce buffer for small access and unaligned access */
+	if (len & XE_CACHELINE_MASK ||
+	    ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
+		int buf_offset = 0;
+
+		/*
+		 * Less than ideal for large unaligned access but this should be
+		 * fairly rare, can fixup if this becomes common.
+		 */
+		do {
+			u8 bounce[XE_CACHELINE_BYTES];
+			void *ptr = (void *)bounce;
+			int err;
+			int copy_bytes = min_t(int, bytes_left,
+					       XE_CACHELINE_BYTES -
+					       (offset & XE_CACHELINE_MASK));
+			int ptr_offset = offset & XE_CACHELINE_MASK;
+
+			err = xe_migrate_access_memory(m, bo,
+						       offset &
+						       ~XE_CACHELINE_MASK,
+						       (void *)ptr,
+						       sizeof(bounce), 0);
+			if (err)
+				return err;
+
+			if (!write) {
+				memcpy(buf + buf_offset, ptr + ptr_offset,
+				       copy_bytes);
+				goto next;
+			}
+
+			memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
+			err = xe_migrate_access_memory(m, bo,
+						       offset & ~XE_CACHELINE_MASK,
+						       (void *)ptr,
+						       sizeof(bounce), 0);
+			if (err)
+				return err;
+
+next:
+			bytes_left -= copy_bytes;
+			buf_offset += copy_bytes;
+			offset += copy_bytes;
+		} while (bytes_left);
+
+		return 0;
+	}
+
+	dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
+	if (IS_ERR(dma_addr))
+		return PTR_ERR(dma_addr);
+
+	xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
+
+	do {
+		struct dma_fence *__fence;
+		u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
+			cursor.start;
+		int current_bytes;
+
+		if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
+			current_bytes = min_t(int, bytes_left,
+					      MAX_PREEMPTDISABLE_TRANSFER);
+		else
+			current_bytes = min_t(int, bytes_left, cursor.size);
+
+		if (fence)
+			dma_fence_put(fence);
+
+		__fence = xe_migrate_vram(m, current_bytes,
+					  (unsigned long)buf & ~PAGE_MASK,
+					  dma_addr + current_page,
+					  vram_addr, write ?
+					  XE_MIGRATE_COPY_TO_VRAM :
+					  XE_MIGRATE_COPY_TO_SRAM);
+		if (IS_ERR(__fence)) {
+			if (fence)
+				dma_fence_wait(fence, false);
+			fence = __fence;
+			goto out_err;
+		}
+		fence = __fence;
+
+		buf += current_bytes;
+		offset += current_bytes;
+		current_page = (int)(buf - orig_buf) / PAGE_SIZE;
+		bytes_left -= current_bytes;
+		if (bytes_left)
+			xe_res_next(&cursor, current_bytes);
+	} while (bytes_left);
+
+	dma_fence_wait(fence, false);
+	dma_fence_put(fence);
+	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
+
+	return 0;
+
+out_err:
+	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
+	return PTR_ERR(fence);
+}
+
 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
 #include "tests/xe_migrate.c"
 #endif
diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
index 6ff9a963425c..fb9839c1bae0 100644
--- a/drivers/gpu/drm/xe/xe_migrate.h
+++ b/drivers/gpu/drm/xe/xe_migrate.h
@@ -112,6 +112,10 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 				  struct ttm_resource *dst,
 				  bool copy_only_ccs);
 
+int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
+			     unsigned long offset, void *buf, int len,
+			     int write);
+
 #define XE_MIGRATE_CLEAR_FLAG_BO_DATA		BIT(0)
 #define XE_MIGRATE_CLEAR_FLAG_CCS_DATA		BIT(1)
 #define XE_MIGRATE_CLEAR_FLAG_FULL	(XE_MIGRATE_CLEAR_FLAG_BO_DATA | \
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v2 3/4] drm/print: Add drm_printer_is_full
  2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
  2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
  2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
@ 2025-04-03 20:27 ` Matthew Brost
  2025-04-03 21:28   ` Cavitt, Jonathan
  2025-04-03 20:27 ` [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full Matthew Brost
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 20:27 UTC (permalink / raw)
  To: intel-xe; +Cc: dri-devel

Add drm_printer_is_full which indicates if a drm printer's output is
full. Useful to short circuit coredump printing once printer's output is
full.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 include/drm/drm_print.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index f31eba1c7cab..db7517ee1b19 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -242,6 +242,23 @@ struct drm_print_iterator {
 	ssize_t offset;
 };
 
+/**
+ * drm_printer_is_full() - DRM printer output is full
+ * @p: DRM printer
+ *
+ * DRM printer output is full, useful to short circuit coredump printing once
+ * printer is full.
+ *
+ * RETURNS:
+ * True if DRM printer output buffer is full, False otherwise
+ */
+static inline bool drm_printer_is_full(struct drm_printer *p)
+{
+	struct drm_print_iterator *iterator = p->arg;
+
+	return !iterator->remain;
+}
+
 /**
  * drm_coredump_printer - construct a &drm_printer that can output to a buffer
  * from the read function for devcoredump
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full
  2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
                   ` (2 preceding siblings ...)
  2025-04-03 20:27 ` [PATCH v2 3/4] drm/print: Add drm_printer_is_full Matthew Brost
@ 2025-04-03 20:27 ` Matthew Brost
  2025-04-03 21:28   ` Cavitt, Jonathan
  2025-04-04  3:31 ` ✓ CI.Patch_applied: success for Large devcoredump file support (rev2) Patchwork
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 20:27 UTC (permalink / raw)
  To: intel-xe; +Cc: dri-devel

Abort printing coredump in VM printer output if full. Helps speedup
large coredumps which need to walked multiple times in
xe_devcoredump_read.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_vm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 864266e38aa7..164824617a2e 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3847,6 +3847,9 @@ void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
 		}
 
 		drm_puts(p, "\n");
+
+		if (drm_printer_is_full(p))
+			return;
 	}
 }
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
  2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
@ 2025-04-03 21:28   ` Cavitt, Jonathan
  2025-04-03 21:39   ` John Harrison
  1 sibling, 0 replies; 20+ messages in thread
From: Cavitt, Jonathan @ 2025-04-03 21:28 UTC (permalink / raw)
  To: Brost, Matthew, intel-xe@lists.freedesktop.org
  Cc: dri-devel@lists.freedesktop.org, Cavitt, Jonathan

-----Original Message-----
From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Thursday, April 3, 2025 1:27 PM
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Subject: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
> 
> Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
> of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
> callback as needed.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

I have no issues with this patch, though you should maybe get a second opinion.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
-Jonathan Cavitt

> ---
>  drivers/gpu/drm/xe/xe_devcoredump.c       | 59 ++++++++++++++++++-----
>  drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 +
>  drivers/gpu/drm/xe/xe_guc_hwconfig.c      |  2 +-
>  3 files changed, 50 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 81b9d9bb3f57..a9e618abf8ac 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
>  	return &q->gt->uc.guc;
>  }
>  
> -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
> +				     ssize_t start,
>  				     struct xe_devcoredump *coredump)
>  {
>  	struct xe_device *xe;
> @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>  	ss = &coredump->snapshot;
>  
>  	iter.data = buffer;
> -	iter.start = 0;
> +	iter.start = start;
>  	iter.remain = count;
>  
>  	p = drm_coredump_printer(&iter);
> @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
>  	ss->vm = NULL;
>  }
>  
> +#define XE_DEVCOREDUMP_CHUNK_MAX	(SZ_512M + SZ_1G)
> +
>  static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  				   size_t count, void *data, size_t datalen)
>  {
> @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  	/* Ensure delayed work is captured before continuing */
>  	flush_work(&ss->work);
>  
> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> +		xe_pm_runtime_get(gt_to_xe(ss->gt));
> +
>  	mutex_lock(&coredump->lock);
>  
>  	if (!ss->read.buffer) {
> @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  		return 0;
>  	}
>  
> +	if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
> +	    offset < ss->read.chunk_position) {
> +		ss->read.chunk_position =
> +			ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
> +
> +		__xe_devcoredump_read(ss->read.buffer,
> +				      XE_DEVCOREDUMP_CHUNK_MAX,
> +				      ss->read.chunk_position, coredump);
> +	}
> +
>  	byte_copied = count < ss->read.size - offset ? count :
>  		ss->read.size - offset;
> -	memcpy(buffer, ss->read.buffer + offset, byte_copied);
> +	memcpy(buffer, ss->read.buffer +
> +	       (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
>  
>  	mutex_unlock(&coredump->lock);
>  
> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> +		xe_pm_runtime_put(gt_to_xe(ss->gt));
> +
>  	return byte_copied;
>  }
>  
> @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
>  	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
>  	xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
>  
> -	xe_pm_runtime_put(xe);
> +	ss->read.chunk_position = 0;
>  
>  	/* Calculate devcoredump size */
> -	ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
> -
> -	ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> -	if (!ss->read.buffer)
> -		return;
> +	ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
> +
> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
> +		ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
> +					   GFP_USER);
> +		if (!ss->read.buffer)
> +			goto put_pm;
> +
> +		__xe_devcoredump_read(ss->read.buffer,
> +				      XE_DEVCOREDUMP_CHUNK_MAX,
> +				      0, coredump);
> +	} else {
> +		ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> +		if (!ss->read.buffer)
> +			goto put_pm;
> +
> +		__xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
> +				      coredump);
> +		xe_devcoredump_snapshot_free(ss);
> +	}
>  
> -	__xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
> -	xe_devcoredump_snapshot_free(ss);
> +put_pm:
> +	xe_pm_runtime_put(xe);
>  }
>  
>  static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
>  	if (offset & 3)
>  		drm_printf(p, "Offset not word aligned: %zu", offset);
>  
> -	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
> +	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
>  	if (!line_buff) {
>  		drm_printf(p, "Failed to allocate line buffer\n");
>  		return;
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 1a1d16a96b2d..a174385a6d83 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
>  	struct {
>  		/** @read.size: size of devcoredump in human readable format */
>  		ssize_t size;
> +		/** @read.chunk_position: position of devcoredump chunk */
> +		ssize_t chunk_position;
>  		/** @read.buffer: buffer of devcoredump in human readable format */
>  		char *buffer;
>  	} read;
> diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> index af2c817d552c..21403a250834 100644
> --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
>  	if (num_dw == 0)
>  		return -EINVAL;
>  
> -	hwconfig = kzalloc(size, GFP_KERNEL);
> +	hwconfig = kzalloc(size, GFP_ATOMIC);
>  	if (!hwconfig)
>  		return -ENOMEM;
>  
> -- 
> 2.34.1
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
  2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
@ 2025-04-03 21:28   ` Cavitt, Jonathan
  2025-04-10  3:58     ` Matthew Brost
  2025-04-04  1:03   ` kernel test robot
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 20+ messages in thread
From: Cavitt, Jonathan @ 2025-04-03 21:28 UTC (permalink / raw)
  To: Brost, Matthew, intel-xe@lists.freedesktop.org
  Cc: dri-devel@lists.freedesktop.org, Cavitt, Jonathan

-----Original Message-----
From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Thursday, April 3, 2025 1:27 PM
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
> 
> Add migrate layer functions to access VRAM and update
> xe_ttm_access_memory to use for non-visible access and large (more than
> 16k) BO access. 8G devcoreump on BMG observed 3 minute CPU copy time vs.
> 3s GPU copy time.
> 
> v4:
>  - Fix non-page aligned accesses
>  - Add support for small / unaligned access
>  - Update commit message indicating migrate used for large accesses (Auld)
>  - Fix warning in xe_res_cursor for non-zero offset
> v5:
>  - Fix 32 bit build (CI)
> v6:
>  - Rebase and use SVM migration copy functions
> v7:
>  - Fix build error (CI)
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

I left some notes down below.  Most are non-blocking, which I labeled as such.  And
as for the rest, it's probable that I'm just misunderstanding some parts of the code, and
that the notes I left are not relevant.

So, I'll be providing my reviewed-by now in the case that the blocking review notes
turn out to not need to be addressed.  And in the case they need to be addressed,
you can take my reviewed-by for the next revision.

Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>

> ---
>  drivers/gpu/drm/xe/xe_bo.c      |  15 ++-
>  drivers/gpu/drm/xe/xe_migrate.c | 221 ++++++++++++++++++++++++++++++--
>  drivers/gpu/drm/xe/xe_migrate.h |   4 +
>  3 files changed, 223 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index 3c7c2353d3c8..c7e6b03d4aef 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -1414,6 +1414,7 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
>  	struct xe_res_cursor cursor;
>  	struct xe_vram_region *vram;
>  	int bytes_left = len;
> +	int err = 0;
>  
>  	xe_bo_assert_held(bo);
>  	xe_device_assert_mem_access(xe);
> @@ -1421,9 +1422,14 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
>  	if (!mem_type_is_vram(ttm_bo->resource->mem_type))
>  		return -EIO;
>  
> -	/* FIXME: Use GPU for non-visible VRAM */
> -	if (!xe_ttm_resource_visible(ttm_bo->resource))
> -		return -EIO;
> +	if (!xe_ttm_resource_visible(ttm_bo->resource) || len >= SZ_16K) {
> +		struct xe_migrate *migrate =
> +			mem_type_to_migrate(xe, ttm_bo->resource->mem_type);
> +
> +		err = xe_migrate_access_memory(migrate, bo, offset, buf, len,
> +					       write);
> +		goto out;
> +	}
>  
>  	vram = res_to_mem_region(ttm_bo->resource);
>  	xe_res_first(ttm_bo->resource, offset & PAGE_MASK,
> @@ -1447,7 +1453,8 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
>  			xe_res_next(&cursor, PAGE_SIZE);
>  	} while (bytes_left);
>  
> -	return len;
> +out:
> +	return err ?: len;
>  }
>  
>  const struct ttm_device_funcs xe_ttm_funcs = {
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index ff0fc2fb0eb9..ba8568691d99 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -670,6 +670,7 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
>  	u32 mocs = 0;
>  	u32 tile_y = 0;
>  
> +	xe_gt_assert(gt, !(pitch & 3));
>  	xe_gt_assert(gt, size / pitch <= S16_MAX);
>  	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
>  	xe_gt_assert(gt, pitch <= U16_MAX);
> @@ -1602,8 +1603,12 @@ enum xe_migrate_copy_dir {
>  	XE_MIGRATE_COPY_TO_SRAM,
>  };
>  
> +#define XE_CACHELINE_BYTES	64ull
> +#define XE_CACHELINE_MASK	(XE_CACHELINE_BYTES - 1)
> +
>  static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> -					 unsigned long npages,
> +					 unsigned long len,
> +					 unsigned long sram_offset,
>  					 dma_addr_t *sram_addr, u64 vram_addr,
>  					 const enum xe_migrate_copy_dir dir)
>  {
> @@ -1613,17 +1618,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
>  	struct dma_fence *fence = NULL;
>  	u32 batch_size = 2;
>  	u64 src_L0_ofs, dst_L0_ofs;
> -	u64 round_update_size;
>  	struct xe_sched_job *job;
>  	struct xe_bb *bb;
>  	u32 update_idx, pt_slot = 0;
> +	unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
> +	unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
> +		PAGE_SIZE : 4;
>  	int err;
>  
> -	if (npages * PAGE_SIZE > MAX_PREEMPTDISABLE_TRANSFER)
> -		return ERR_PTR(-EINVAL);
> +	if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
> +			(sram_offset | vram_addr) & XE_CACHELINE_MASK))
> +		return ERR_PTR(-EOPNOTSUPP);
>  
> -	round_update_size = npages * PAGE_SIZE;
> -	batch_size += pte_update_cmd_size(round_update_size);
> +	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);

Hmm... Does this case still need to return an error value?  I don't think replacing it with an
xe_assert is entirely valid, as asserts should not be used as a replacement of proper error
handling.  Or so I've been told.

So, I guess the question is: what error was this preventing previously, and is it still a concern?

> +
> +	batch_size += pte_update_cmd_size(len);
>  	batch_size += EMIT_COPY_DW;
>  
>  	bb = xe_bb_new(gt, batch_size, use_usm_batch);
> @@ -1633,22 +1642,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
>  	}
>  
>  	build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
> -				   sram_addr, round_update_size);
> +				   sram_addr, len + sram_offset);
>  
>  	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
> -		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
> +		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
>  		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
>  
>  	} else {
>  		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
> -		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
> +		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
>  	}
>  
>  	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>  	update_idx = bb->len;
>  
> -	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
> -		  XE_PAGE_SIZE);
> +	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
>  
>  	job = xe_bb_create_migration_job(m->q, bb,
>  					 xe_migrate_batch_base(m, use_usm_batch),
> @@ -1696,7 +1704,7 @@ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
>  				     dma_addr_t *src_addr,
>  				     u64 dst_addr)
>  {
> -	return xe_migrate_vram(m, npages, src_addr, dst_addr,
> +	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
>  			       XE_MIGRATE_COPY_TO_VRAM);
>  }
>  
> @@ -1717,12 +1725,199 @@ struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
>  				       u64 src_addr,
>  				       dma_addr_t *dst_addr)
>  {
> -	return xe_migrate_vram(m, npages, dst_addr, src_addr,
> +	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
>  			       XE_MIGRATE_COPY_TO_SRAM);
>  }
>  
>  #endif
>  
> +static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
> +				 int len, int write)
> +{
> +	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!dma_addr[i])
> +			continue;

Non-blocking:
I'm guessing the dma_addr array can be non-contiguous (I.E. position 3 having a
dma address does not imply positions 1 and 2 have them).  Because otherwise
this can be a break condition instead of a continue.

> +
> +		dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
> +			       write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
> +	}
> +	kfree(dma_addr);
> +}
> +
> +static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe,
> +				      void *buf, int len, int write)
> +{
> +	dma_addr_t *dma_addr;
> +	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
> +
> +	dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL);
> +	if (!dma_addr)
> +		return ERR_PTR(-ENOMEM);
> +
> +	for (i = 0; i < npages; ++i) {
> +		dma_addr_t addr;
> +		struct page *page;
> +
> +		if (is_vmalloc_addr(buf))
> +			page = vmalloc_to_page(buf);
> +		else
> +			page = virt_to_page(buf);
> +
> +		addr = dma_map_page(xe->drm.dev,
> +				    page, 0, PAGE_SIZE,
> +				    write ? DMA_TO_DEVICE :
> +				    DMA_FROM_DEVICE);
> +		if (dma_mapping_error(xe->drm.dev, addr))
> +			goto err_fault;
> +
> +		dma_addr[i] = addr;
> +		buf += PAGE_SIZE;
> +	}
> +
> +	return dma_addr;
> +
> +err_fault:
> +	xe_migrate_dma_unmap(xe, dma_addr, len, write);
> +	return ERR_PTR(-EFAULT);
> +}
> +
> +/**
> + * xe_migrate_access_memory - Access memory of a BO via GPU
> + *
> + * @m: The migration context.
> + * @bo: buffer object
> + * @offset: access offset into buffer object
> + * @buf: pointer to caller memory to read into or write from
> + * @len: length of access
> + * @write: write access
> + *
> + * Access memory of a BO via GPU either reading in or writing from a passed in
> + * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
> + * read to or write from pointer.
> + *
> + * Returns:
> + * 0 if successful, negative error code on failure.
> + */
> +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> +			     unsigned long offset, void *buf, int len,
> +			     int write)
> +{
> +	struct xe_tile *tile = m->tile;
> +	struct xe_device *xe = tile_to_xe(tile);
> +	struct xe_res_cursor cursor;
> +	struct dma_fence *fence = NULL;
> +	dma_addr_t *dma_addr;
> +	unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
> +	int bytes_left = len, current_page = 0;
> +	void *orig_buf = buf;
> +
> +	xe_bo_assert_held(bo);
> +
> +	/* Use bounce buffer for small access and unaligned access */
> +	if (len & XE_CACHELINE_MASK ||
> +	    ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
> +		int buf_offset = 0;
> +
> +		/*
> +		 * Less than ideal for large unaligned access but this should be
> +		 * fairly rare, can fixup if this becomes common.
> +		 */
> +		do {
> +			u8 bounce[XE_CACHELINE_BYTES];
> +			void *ptr = (void *)bounce;
> +			int err;
> +			int copy_bytes = min_t(int, bytes_left,
> +					       XE_CACHELINE_BYTES -
> +					       (offset & XE_CACHELINE_MASK));
> +			int ptr_offset = offset & XE_CACHELINE_MASK;
> +
> +			err = xe_migrate_access_memory(m, bo,
> +						       offset &
> +						       ~XE_CACHELINE_MASK,
> +						       (void *)ptr,
> +						       sizeof(bounce), 0);
> +			if (err)
> +				return err;
> +
> +			if (!write) {
> +				memcpy(buf + buf_offset, ptr + ptr_offset,
> +				       copy_bytes);
> +				goto next;
> +			}

Non-blocking:
This should debatably be an if-else block, rather than relying on a "goto next"
here, as "goto next" is not used elsewhere in the function.

"""
	if (write) {
		memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
		err = xe_migrate_access_memory(m, bo,
						    offset & ~XE_CACHELINE_MASK,
						    (void *)ptr,
						    sizeof(bounce), 0);
	} else {
		memcpy(buf + buf_offset, ptr + ptr_offset,
			  copy_bytes);
	}

	if (err)
		return err;
"""

> +
> +			memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
> +			err = xe_migrate_access_memory(m, bo,
> +						       offset & ~XE_CACHELINE_MASK,
> +						       (void *)ptr,
> +						       sizeof(bounce), 0);
> +			if (err)
> +				return err;
> +
> +next:
> +			bytes_left -= copy_bytes;
> +			buf_offset += copy_bytes;
> +			offset += copy_bytes;
> +		} while (bytes_left);
> +
> +		return 0;
> +	}
> +
> +	dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
> +	if (IS_ERR(dma_addr))
> +		return PTR_ERR(dma_addr);
> +
> +	xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
> +
> +	do {
> +		struct dma_fence *__fence;
> +		u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
> +			cursor.start;
> +		int current_bytes;
> +
> +		if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
> +			current_bytes = min_t(int, bytes_left,
> +					      MAX_PREEMPTDISABLE_TRANSFER);
> +		else
> +			current_bytes = min_t(int, bytes_left, cursor.size);
> +
> +		if (fence)
> +			dma_fence_put(fence);
> +
> +		__fence = xe_migrate_vram(m, current_bytes,
> +					  (unsigned long)buf & ~PAGE_MASK,
> +					  dma_addr + current_page,
> +					  vram_addr, write ?
> +					  XE_MIGRATE_COPY_TO_VRAM :
> +					  XE_MIGRATE_COPY_TO_SRAM);
> +		if (IS_ERR(__fence)) {
> +			if (fence)
> +				dma_fence_wait(fence, false);
> +			fence = __fence;
> +			goto out_err;
> +		}
> +		fence = __fence;

Non-blocking:
It would be nice if we didn't have two paths assigning fence the __fence value,
but any attempt I do to try and change that ends in having to compute the IS_ERR
value for __fence twice.  E.G.:

"""
	if (IS_ERR(__fence) && fence)
		dma_fence_wait(fence, false);
	fence = __fence;
	if (IS_ERR(fence))
		goto out_err;
"""

So, either way works.

> +
> +		buf += current_bytes;
> +		offset += current_bytes;
> +		current_page = (int)(buf - orig_buf) / PAGE_SIZE;
> +		bytes_left -= current_bytes;
> +		if (bytes_left)
> +			xe_res_next(&cursor, current_bytes);
> +	} while (bytes_left);
> +
> +	dma_fence_wait(fence, false);
> +	dma_fence_put(fence);
> +	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
> +
> +	return 0;
> +
> +out_err:
> +	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
> +	return PTR_ERR(fence);

Non-blocking:
The branching execution might be able to be shortened.  Perhaps:
"""
	dma_fence_wait(fence, false);
	dma_fence_put(fence);

out_err:
	xe_migtate_dma_unmap(xe, dma_addr, len + page_offset, write);
	return IS_ERR(fence) ? PTR_ERR(fence) : 0;
}
"""
-Jonathan Cavitt

> +}
> +
>  #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
>  #include "tests/xe_migrate.c"
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> index 6ff9a963425c..fb9839c1bae0 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.h
> +++ b/drivers/gpu/drm/xe/xe_migrate.h
> @@ -112,6 +112,10 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>  				  struct ttm_resource *dst,
>  				  bool copy_only_ccs);
>  
> +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> +			     unsigned long offset, void *buf, int len,
> +			     int write);
> +
>  #define XE_MIGRATE_CLEAR_FLAG_BO_DATA		BIT(0)
>  #define XE_MIGRATE_CLEAR_FLAG_CCS_DATA		BIT(1)
>  #define XE_MIGRATE_CLEAR_FLAG_FULL	(XE_MIGRATE_CLEAR_FLAG_BO_DATA | \
> -- 
> 2.34.1
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v2 3/4] drm/print: Add drm_printer_is_full
  2025-04-03 20:27 ` [PATCH v2 3/4] drm/print: Add drm_printer_is_full Matthew Brost
@ 2025-04-03 21:28   ` Cavitt, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Cavitt, Jonathan @ 2025-04-03 21:28 UTC (permalink / raw)
  To: Brost, Matthew, intel-xe@lists.freedesktop.org
  Cc: dri-devel@lists.freedesktop.org, Cavitt, Jonathan

-----Original Message-----
From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Thursday, April 3, 2025 1:27 PM
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Subject: [PATCH v2 3/4] drm/print: Add drm_printer_is_full
> 
> Add drm_printer_is_full which indicates if a drm printer's output is
> full. Useful to short circuit coredump printing once printer's output is
> full.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

LGTM, though I will admit that "remain" is a bit of a strange name for the
iterator variable.  I'd almost imagine "remain" to refer to the "remaining
output the iterator needs to print", rather than the "remaining space in
the output buffer of the iterator".
But that's just an aside.  Nothing worth blocking on.

Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
-Jonathan Cavitt

> ---
>  include/drm/drm_print.h | 17 +++++++++++++++++
>  1 file changed, 17 insertions(+)
> 
> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
> index f31eba1c7cab..db7517ee1b19 100644
> --- a/include/drm/drm_print.h
> +++ b/include/drm/drm_print.h
> @@ -242,6 +242,23 @@ struct drm_print_iterator {
>  	ssize_t offset;
>  };
>  
> +/**
> + * drm_printer_is_full() - DRM printer output is full
> + * @p: DRM printer
> + *
> + * DRM printer output is full, useful to short circuit coredump printing once
> + * printer is full.
> + *
> + * RETURNS:
> + * True if DRM printer output buffer is full, False otherwise
> + */
> +static inline bool drm_printer_is_full(struct drm_printer *p)
> +{
> +	struct drm_print_iterator *iterator = p->arg;
> +
> +	return !iterator->remain;
> +}
> +
>  /**
>   * drm_coredump_printer - construct a &drm_printer that can output to a buffer
>   * from the read function for devcoredump
> -- 
> 2.34.1
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full
  2025-04-03 20:27 ` [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full Matthew Brost
@ 2025-04-03 21:28   ` Cavitt, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Cavitt, Jonathan @ 2025-04-03 21:28 UTC (permalink / raw)
  To: Brost, Matthew, intel-xe@lists.freedesktop.org
  Cc: dri-devel@lists.freedesktop.org, Cavitt, Jonathan

-----Original Message-----
From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Thursday, April 3, 2025 1:27 PM
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Subject: [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full
> 
> Abort printing coredump in VM printer output if full. Helps speedup
> large coredumps which need to walked multiple times in
> xe_devcoredump_read.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

LGTM.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
-Jonathan Cavitt

> ---
>  drivers/gpu/drm/xe/xe_vm.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 864266e38aa7..164824617a2e 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -3847,6 +3847,9 @@ void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
>  		}
>  
>  		drm_puts(p, "\n");
> +
> +		if (drm_printer_is_full(p))
> +			return;
>  	}
>  }
>  
> -- 
> 2.34.1
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
  2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
  2025-04-03 21:28   ` Cavitt, Jonathan
@ 2025-04-03 21:39   ` John Harrison
  2025-04-03 22:32     ` Matthew Brost
  1 sibling, 1 reply; 20+ messages in thread
From: John Harrison @ 2025-04-03 21:39 UTC (permalink / raw)
  To: Matthew Brost, intel-xe; +Cc: dri-devel

On 4/3/2025 1:27 PM, Matthew Brost wrote:
> Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
> of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
> callback as needed.
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_devcoredump.c       | 59 ++++++++++++++++++-----
>   drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 +
>   drivers/gpu/drm/xe/xe_guc_hwconfig.c      |  2 +-
>   3 files changed, 50 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 81b9d9bb3f57..a9e618abf8ac 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
>   	return &q->gt->uc.guc;
>   }
>   
> -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
> +				     ssize_t start,
>   				     struct xe_devcoredump *coredump)
>   {
>   	struct xe_device *xe;
> @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>   	ss = &coredump->snapshot;
>   
>   	iter.data = buffer;
> -	iter.start = 0;
> +	iter.start = start;
>   	iter.remain = count;
>   
>   	p = drm_coredump_printer(&iter);
> @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
>   	ss->vm = NULL;
>   }
>   
> +#define XE_DEVCOREDUMP_CHUNK_MAX	(SZ_512M + SZ_1G)
> +
>   static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>   				   size_t count, void *data, size_t datalen)
>   {
> @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>   	/* Ensure delayed work is captured before continuing */
>   	flush_work(&ss->work);
>   
> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> +		xe_pm_runtime_get(gt_to_xe(ss->gt));
> +
>   	mutex_lock(&coredump->lock);
>   
>   	if (!ss->read.buffer) {
> @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>   		return 0;
>   	}
>   
> +	if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
> +	    offset < ss->read.chunk_position) {
> +		ss->read.chunk_position =
> +			ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
> +
> +		__xe_devcoredump_read(ss->read.buffer,
> +				      XE_DEVCOREDUMP_CHUNK_MAX,
> +				      ss->read.chunk_position, coredump);
> +	}
> +
>   	byte_copied = count < ss->read.size - offset ? count :
>   		ss->read.size - offset;
> -	memcpy(buffer, ss->read.buffer + offset, byte_copied);
> +	memcpy(buffer, ss->read.buffer +
> +	       (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
>   
>   	mutex_unlock(&coredump->lock);
>   
> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> +		xe_pm_runtime_put(gt_to_xe(ss->gt));
> +
>   	return byte_copied;
>   }
>   
> @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
>   	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
>   	xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
>   
> -	xe_pm_runtime_put(xe);
> +	ss->read.chunk_position = 0;
>   
>   	/* Calculate devcoredump size */
> -	ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
> -
> -	ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> -	if (!ss->read.buffer)
> -		return;
> +	ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
> +
> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
> +		ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
> +					   GFP_USER);
> +		if (!ss->read.buffer)
> +			goto put_pm;
> +
> +		__xe_devcoredump_read(ss->read.buffer,
> +				      XE_DEVCOREDUMP_CHUNK_MAX,
> +				      0, coredump);
> +	} else {
> +		ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> +		if (!ss->read.buffer)
> +			goto put_pm;
> +
> +		__xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
> +				      coredump);
> +		xe_devcoredump_snapshot_free(ss);
> +	}
>   
> -	__xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
> -	xe_devcoredump_snapshot_free(ss);
> +put_pm:
> +	xe_pm_runtime_put(xe);
>   }
>   
>   static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
>   	if (offset & 3)
>   		drm_printf(p, "Offset not word aligned: %zu", offset);
>   
> -	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
> +	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
Is this related? Or should it be a separate patch?

>   	if (!line_buff) {
>   		drm_printf(p, "Failed to allocate line buffer\n");
>   		return;
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 1a1d16a96b2d..a174385a6d83 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
>   	struct {
>   		/** @read.size: size of devcoredump in human readable format */
>   		ssize_t size;
> +		/** @read.chunk_position: position of devcoredump chunk */
> +		ssize_t chunk_position;
>   		/** @read.buffer: buffer of devcoredump in human readable format */
>   		char *buffer;
>   	} read;
> diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> index af2c817d552c..21403a250834 100644
> --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
>   	if (num_dw == 0)
>   		return -EINVAL;
>   
> -	hwconfig = kzalloc(size, GFP_KERNEL);
> +	hwconfig = kzalloc(size, GFP_ATOMIC);
Likewise this?

John.

>   	if (!hwconfig)
>   		return -ENOMEM;
>   


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
  2025-04-03 21:39   ` John Harrison
@ 2025-04-03 22:32     ` Matthew Brost
  2025-04-03 22:41       ` John Harrison
  0 siblings, 1 reply; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 22:32 UTC (permalink / raw)
  To: John Harrison; +Cc: intel-xe, dri-devel

On Thu, Apr 03, 2025 at 02:39:16PM -0700, John Harrison wrote:
> On 4/3/2025 1:27 PM, Matthew Brost wrote:
> > Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
> > of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
> > callback as needed.
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/xe/xe_devcoredump.c       | 59 ++++++++++++++++++-----
> >   drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 +
> >   drivers/gpu/drm/xe/xe_guc_hwconfig.c      |  2 +-
> >   3 files changed, 50 insertions(+), 13 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> > index 81b9d9bb3f57..a9e618abf8ac 100644
> > --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> > @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
> >   	return &q->gt->uc.guc;
> >   }
> > -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> > +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
> > +				     ssize_t start,
> >   				     struct xe_devcoredump *coredump)
> >   {
> >   	struct xe_device *xe;
> > @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> >   	ss = &coredump->snapshot;
> >   	iter.data = buffer;
> > -	iter.start = 0;
> > +	iter.start = start;
> >   	iter.remain = count;
> >   	p = drm_coredump_printer(&iter);
> > @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
> >   	ss->vm = NULL;
> >   }
> > +#define XE_DEVCOREDUMP_CHUNK_MAX	(SZ_512M + SZ_1G)
> > +
> >   static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> >   				   size_t count, void *data, size_t datalen)
> >   {
> > @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> >   	/* Ensure delayed work is captured before continuing */
> >   	flush_work(&ss->work);
> > +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> > +		xe_pm_runtime_get(gt_to_xe(ss->gt));
> > +
> >   	mutex_lock(&coredump->lock);
> >   	if (!ss->read.buffer) {
> > @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> >   		return 0;
> >   	}
> > +	if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
> > +	    offset < ss->read.chunk_position) {
> > +		ss->read.chunk_position =
> > +			ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
> > +
> > +		__xe_devcoredump_read(ss->read.buffer,
> > +				      XE_DEVCOREDUMP_CHUNK_MAX,
> > +				      ss->read.chunk_position, coredump);
> > +	}
> > +
> >   	byte_copied = count < ss->read.size - offset ? count :
> >   		ss->read.size - offset;
> > -	memcpy(buffer, ss->read.buffer + offset, byte_copied);
> > +	memcpy(buffer, ss->read.buffer +
> > +	       (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
> >   	mutex_unlock(&coredump->lock);
> > +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> > +		xe_pm_runtime_put(gt_to_xe(ss->gt));
> > +
> >   	return byte_copied;
> >   }
> > @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
> >   	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
> >   	xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
> > -	xe_pm_runtime_put(xe);
> > +	ss->read.chunk_position = 0;
> >   	/* Calculate devcoredump size */
> > -	ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
> > -
> > -	ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> > -	if (!ss->read.buffer)
> > -		return;
> > +	ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
> > +
> > +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
> > +		ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
> > +					   GFP_USER);
> > +		if (!ss->read.buffer)
> > +			goto put_pm;
> > +
> > +		__xe_devcoredump_read(ss->read.buffer,
> > +				      XE_DEVCOREDUMP_CHUNK_MAX,
> > +				      0, coredump);
> > +	} else {
> > +		ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> > +		if (!ss->read.buffer)
> > +			goto put_pm;
> > +
> > +		__xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
> > +				      coredump);
> > +		xe_devcoredump_snapshot_free(ss);
> > +	}
> > -	__xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
> > -	xe_devcoredump_snapshot_free(ss);
> > +put_pm:
> > +	xe_pm_runtime_put(xe);
> >   }
> >   static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> > @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
> >   	if (offset & 3)
> >   		drm_printf(p, "Offset not word aligned: %zu", offset);
> > -	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
> > +	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
> Is this related? Or should it be a separate patch?
> 

It is related. Now that __xe_devcoredump_read is called under
'coredump->lock' we are in the path of reclaim, __xe_devcoredump_read
calls xe_print_blob_ascii85.

Both cases the allocation is relatively small, so would be fairly
unlikely to fail.

I could make this a seperate prep patch if you think that would be
better.

> >   	if (!line_buff) {
> >   		drm_printf(p, "Failed to allocate line buffer\n");
> >   		return;
> > diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > index 1a1d16a96b2d..a174385a6d83 100644
> > --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
> >   	struct {
> >   		/** @read.size: size of devcoredump in human readable format */
> >   		ssize_t size;
> > +		/** @read.chunk_position: position of devcoredump chunk */
> > +		ssize_t chunk_position;
> >   		/** @read.buffer: buffer of devcoredump in human readable format */
> >   		char *buffer;
> >   	} read;
> > diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > index af2c817d552c..21403a250834 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
> >   	if (num_dw == 0)
> >   		return -EINVAL;
> > -	hwconfig = kzalloc(size, GFP_KERNEL);
> > +	hwconfig = kzalloc(size, GFP_ATOMIC);
> Likewise this?
> 

Same as above.

Matt

> John.
> 
> >   	if (!hwconfig)
> >   		return -ENOMEM;
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
  2025-04-03 22:32     ` Matthew Brost
@ 2025-04-03 22:41       ` John Harrison
  2025-04-04  2:33         ` Matthew Brost
  0 siblings, 1 reply; 20+ messages in thread
From: John Harrison @ 2025-04-03 22:41 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe, dri-devel

On 4/3/2025 3:32 PM, Matthew Brost wrote:
> On Thu, Apr 03, 2025 at 02:39:16PM -0700, John Harrison wrote:
>> On 4/3/2025 1:27 PM, Matthew Brost wrote:
>>> Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
>>> of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
>>> callback as needed.
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/xe/xe_devcoredump.c       | 59 ++++++++++++++++++-----
>>>    drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 +
>>>    drivers/gpu/drm/xe/xe_guc_hwconfig.c      |  2 +-
>>>    3 files changed, 50 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
>>> index 81b9d9bb3f57..a9e618abf8ac 100644
>>> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
>>> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
>>> @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
>>>    	return &q->gt->uc.guc;
>>>    }
>>> -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>>> +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
>>> +				     ssize_t start,
>>>    				     struct xe_devcoredump *coredump)
>>>    {
>>>    	struct xe_device *xe;
>>> @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>>>    	ss = &coredump->snapshot;
>>>    	iter.data = buffer;
>>> -	iter.start = 0;
>>> +	iter.start = start;
>>>    	iter.remain = count;
>>>    	p = drm_coredump_printer(&iter);
>>> @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
>>>    	ss->vm = NULL;
>>>    }
>>> +#define XE_DEVCOREDUMP_CHUNK_MAX	(SZ_512M + SZ_1G)
>>> +
>>>    static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>>    				   size_t count, void *data, size_t datalen)
>>>    {
>>> @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>>    	/* Ensure delayed work is captured before continuing */
>>>    	flush_work(&ss->work);
>>> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
>>> +		xe_pm_runtime_get(gt_to_xe(ss->gt));
>>> +
>>>    	mutex_lock(&coredump->lock);
>>>    	if (!ss->read.buffer) {
>>> @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>>    		return 0;
>>>    	}
>>> +	if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
>>> +	    offset < ss->read.chunk_position) {
>>> +		ss->read.chunk_position =
>>> +			ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
>>> +
>>> +		__xe_devcoredump_read(ss->read.buffer,
>>> +				      XE_DEVCOREDUMP_CHUNK_MAX,
>>> +				      ss->read.chunk_position, coredump);
>>> +	}
>>> +
>>>    	byte_copied = count < ss->read.size - offset ? count :
>>>    		ss->read.size - offset;
>>> -	memcpy(buffer, ss->read.buffer + offset, byte_copied);
>>> +	memcpy(buffer, ss->read.buffer +
>>> +	       (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
>>>    	mutex_unlock(&coredump->lock);
>>> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
>>> +		xe_pm_runtime_put(gt_to_xe(ss->gt));
>>> +
>>>    	return byte_copied;
>>>    }
>>> @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
>>>    	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
>>>    	xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
>>> -	xe_pm_runtime_put(xe);
>>> +	ss->read.chunk_position = 0;
>>>    	/* Calculate devcoredump size */
>>> -	ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
>>> -
>>> -	ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
>>> -	if (!ss->read.buffer)
>>> -		return;
>>> +	ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
>>> +
>>> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
>>> +		ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
>>> +					   GFP_USER);
>>> +		if (!ss->read.buffer)
>>> +			goto put_pm;
>>> +
>>> +		__xe_devcoredump_read(ss->read.buffer,
>>> +				      XE_DEVCOREDUMP_CHUNK_MAX,
>>> +				      0, coredump);
>>> +	} else {
>>> +		ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
>>> +		if (!ss->read.buffer)
>>> +			goto put_pm;
>>> +
>>> +		__xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
>>> +				      coredump);
>>> +		xe_devcoredump_snapshot_free(ss);
>>> +	}
>>> -	__xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
>>> -	xe_devcoredump_snapshot_free(ss);
>>> +put_pm:
>>> +	xe_pm_runtime_put(xe);
>>>    }
>>>    static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>>> @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
>>>    	if (offset & 3)
>>>    		drm_printf(p, "Offset not word aligned: %zu", offset);
>>> -	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
>>> +	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
>> Is this related? Or should it be a separate patch?
>>
> It is related. Now that __xe_devcoredump_read is called under
> 'coredump->lock' we are in the path of reclaim, __xe_devcoredump_read
> calls xe_print_blob_ascii85.
>
> Both cases the allocation is relatively small, so would be fairly
> unlikely to fail.
>
> I could make this a seperate prep patch if you think that would be
> better.
Not worth splitting if this is the requirement. But maybe just add a 
comment to the commit message about why it is necessary.

>
>>>    	if (!line_buff) {
>>>    		drm_printf(p, "Failed to allocate line buffer\n");
>>>    		return;
>>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
>>> index 1a1d16a96b2d..a174385a6d83 100644
>>> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
>>> @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
>>>    	struct {
>>>    		/** @read.size: size of devcoredump in human readable format */
>>>    		ssize_t size;
>>> +		/** @read.chunk_position: position of devcoredump chunk */
>>> +		ssize_t chunk_position;
>>>    		/** @read.buffer: buffer of devcoredump in human readable format */
>>>    		char *buffer;
>>>    	} read;
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
>>> index af2c817d552c..21403a250834 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
>>> @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
>>>    	if (num_dw == 0)
>>>    		return -EINVAL;
>>> -	hwconfig = kzalloc(size, GFP_KERNEL);
>>> +	hwconfig = kzalloc(size, GFP_ATOMIC);
>> Likewise this?
Hmm, hadn't realised we were doing an alloc/copy/free for every config 
table access. Is that just to avoid doing iosys type reads because the 
table is in LMEM? Seems a lot of overhead for each individual u32 read!

But also why are we doing table reads during core dump -> text printing? 
AFAICT, the only reads are in the steering code for doing EU register 
reads. Don't all the reg reads happen up front and then we stop 
accessing the hardware while doing the chunked dump? If we are 
re-reading hardware state when doing the text conversion we can end up 
with inconsistent dumps as the state changes from one chunk to the next?

John.


>>
> Same as above.
>
> Matt
>
>> John.
>>
>>>    	if (!hwconfig)
>>>    		return -ENOMEM;


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
  2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
  2025-04-03 21:28   ` Cavitt, Jonathan
@ 2025-04-04  1:03   ` kernel test robot
  2025-04-04  3:36   ` kernel test robot
  2025-04-04  3:46   ` kernel test robot
  3 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2025-04-04  1:03 UTC (permalink / raw)
  To: Matthew Brost; +Cc: oe-kbuild-all

Hi Matthew,

kernel test robot noticed the following build warnings:

[auto build test WARNING on drm-xe/drm-xe-next]
[also build test WARNING on next-20250403]
[cannot apply to drm-exynos/exynos-drm-next linus/master drm/drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-misc/drm-misc-next drm-tip/drm-tip v6.14]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Matthew-Brost/drm-xe-Add-devcoredump-chunking/20250404-042700
base:   https://gitlab.freedesktop.org/drm/xe/kernel.git drm-xe-next
patch link:    https://lore.kernel.org/r/20250403202705.18488-3-matthew.brost%40intel.com
patch subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
config: alpha-allyesconfig (https://download.01.org/0day-ci/archive/20250404/202504040820.oEc4vaWF-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250404/202504040820.oEc4vaWF-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202504040820.oEc4vaWF-lkp@intel.com/

All warnings (new ones prefixed by >>):

   drivers/gpu/drm/xe/xe_migrate.c: In function 'xe_migrate_access_memory':
   drivers/gpu/drm/xe/xe_migrate.c:1819:19: error: 'XE_CACHELINE_MASK' undeclared (first use in this function)
    1819 |         if (len & XE_CACHELINE_MASK ||
         |                   ^~~~~~~~~~~~~~~~~
   drivers/gpu/drm/xe/xe_migrate.c:1819:19: note: each undeclared identifier is reported only once for each function it appears in
   drivers/gpu/drm/xe/xe_migrate.c:1828:35: error: 'XE_CACHELINE_BYTES' undeclared (first use in this function); did you mean 'L1_CACHE_BYTES'?
    1828 |                         u8 bounce[XE_CACHELINE_BYTES];
         |                                   ^~~~~~~~~~~~~~~~~~
         |                                   L1_CACHE_BYTES
>> drivers/gpu/drm/xe/xe_migrate.c:1828:28: warning: unused variable 'bounce' [-Wunused-variable]
    1828 |                         u8 bounce[XE_CACHELINE_BYTES];
         |                            ^~~~~~
   drivers/gpu/drm/xe/xe_migrate.c:1888:27: error: implicit declaration of function 'xe_migrate_vram'; did you mean 'xe_migrate_to_vram'? [-Wimplicit-function-declaration]
    1888 |                 __fence = xe_migrate_vram(m, current_bytes,
         |                           ^~~~~~~~~~~~~~~
         |                           xe_migrate_to_vram
   drivers/gpu/drm/xe/xe_migrate.c:1892:43: error: 'XE_MIGRATE_COPY_TO_VRAM' undeclared (first use in this function)
    1892 |                                           XE_MIGRATE_COPY_TO_VRAM :
         |                                           ^~~~~~~~~~~~~~~~~~~~~~~
   drivers/gpu/drm/xe/xe_migrate.c:1893:43: error: 'XE_MIGRATE_COPY_TO_SRAM' undeclared (first use in this function)
    1893 |                                           XE_MIGRATE_COPY_TO_SRAM);
         |                                           ^~~~~~~~~~~~~~~~~~~~~~~


vim +/bounce +1828 drivers/gpu/drm/xe/xe_migrate.c

  1785	
  1786	/**
  1787	 * xe_migrate_access_memory - Access memory of a BO via GPU
  1788	 *
  1789	 * @m: The migration context.
  1790	 * @bo: buffer object
  1791	 * @offset: access offset into buffer object
  1792	 * @buf: pointer to caller memory to read into or write from
  1793	 * @len: length of access
  1794	 * @write: write access
  1795	 *
  1796	 * Access memory of a BO via GPU either reading in or writing from a passed in
  1797	 * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
  1798	 * read to or write from pointer.
  1799	 *
  1800	 * Returns:
  1801	 * 0 if successful, negative error code on failure.
  1802	 */
  1803	int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
  1804				     unsigned long offset, void *buf, int len,
  1805				     int write)
  1806	{
  1807		struct xe_tile *tile = m->tile;
  1808		struct xe_device *xe = tile_to_xe(tile);
  1809		struct xe_res_cursor cursor;
  1810		struct dma_fence *fence = NULL;
  1811		dma_addr_t *dma_addr;
  1812		unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
  1813		int bytes_left = len, current_page = 0;
  1814		void *orig_buf = buf;
  1815	
  1816		xe_bo_assert_held(bo);
  1817	
  1818		/* Use bounce buffer for small access and unaligned access */
> 1819		if (len & XE_CACHELINE_MASK ||
  1820		    ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
  1821			int buf_offset = 0;
  1822	
  1823			/*
  1824			 * Less than ideal for large unaligned access but this should be
  1825			 * fairly rare, can fixup if this becomes common.
  1826			 */
  1827			do {
> 1828				u8 bounce[XE_CACHELINE_BYTES];
  1829				void *ptr = (void *)bounce;
  1830				int err;
  1831				int copy_bytes = min_t(int, bytes_left,
  1832						       XE_CACHELINE_BYTES -
  1833						       (offset & XE_CACHELINE_MASK));
  1834				int ptr_offset = offset & XE_CACHELINE_MASK;
  1835	
  1836				err = xe_migrate_access_memory(m, bo,
  1837							       offset &
  1838							       ~XE_CACHELINE_MASK,
  1839							       (void *)ptr,
  1840							       sizeof(bounce), 0);
  1841				if (err)
  1842					return err;
  1843	
  1844				if (!write) {
  1845					memcpy(buf + buf_offset, ptr + ptr_offset,
  1846					       copy_bytes);
  1847					goto next;
  1848				}
  1849	
  1850				memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
  1851				err = xe_migrate_access_memory(m, bo,
  1852							       offset & ~XE_CACHELINE_MASK,
  1853							       (void *)ptr,
  1854							       sizeof(bounce), 0);
  1855				if (err)
  1856					return err;
  1857	
  1858	next:
  1859				bytes_left -= copy_bytes;
  1860				buf_offset += copy_bytes;
  1861				offset += copy_bytes;
  1862			} while (bytes_left);
  1863	
  1864			return 0;
  1865		}
  1866	
  1867		dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
  1868		if (IS_ERR(dma_addr))
  1869			return PTR_ERR(dma_addr);
  1870	
  1871		xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
  1872	
  1873		do {
  1874			struct dma_fence *__fence;
  1875			u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
  1876				cursor.start;
  1877			int current_bytes;
  1878	
  1879			if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
  1880				current_bytes = min_t(int, bytes_left,
  1881						      MAX_PREEMPTDISABLE_TRANSFER);
  1882			else
  1883				current_bytes = min_t(int, bytes_left, cursor.size);
  1884	
  1885			if (fence)
  1886				dma_fence_put(fence);
  1887	
  1888			__fence = xe_migrate_vram(m, current_bytes,
  1889						  (unsigned long)buf & ~PAGE_MASK,
  1890						  dma_addr + current_page,
  1891						  vram_addr, write ?
  1892						  XE_MIGRATE_COPY_TO_VRAM :
  1893						  XE_MIGRATE_COPY_TO_SRAM);
  1894			if (IS_ERR(__fence)) {
  1895				if (fence)
  1896					dma_fence_wait(fence, false);
  1897				fence = __fence;
  1898				goto out_err;
  1899			}
  1900			fence = __fence;
  1901	
  1902			buf += current_bytes;
  1903			offset += current_bytes;
  1904			current_page = (int)(buf - orig_buf) / PAGE_SIZE;
  1905			bytes_left -= current_bytes;
  1906			if (bytes_left)
  1907				xe_res_next(&cursor, current_bytes);
  1908		} while (bytes_left);
  1909	
  1910		dma_fence_wait(fence, false);
  1911		dma_fence_put(fence);
  1912		xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
  1913	
  1914		return 0;
  1915	
  1916	out_err:
  1917		xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
  1918		return PTR_ERR(fence);
  1919	}
  1920	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
  2025-04-03 22:41       ` John Harrison
@ 2025-04-04  2:33         ` Matthew Brost
  0 siblings, 0 replies; 20+ messages in thread
From: Matthew Brost @ 2025-04-04  2:33 UTC (permalink / raw)
  To: John Harrison; +Cc: intel-xe, dri-devel

On Thu, Apr 03, 2025 at 03:41:16PM -0700, John Harrison wrote:
> On 4/3/2025 3:32 PM, Matthew Brost wrote:
> > On Thu, Apr 03, 2025 at 02:39:16PM -0700, John Harrison wrote:
> > > On 4/3/2025 1:27 PM, Matthew Brost wrote:
> > > > Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
> > > > of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
> > > > callback as needed.
> > > > 
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >    drivers/gpu/drm/xe/xe_devcoredump.c       | 59 ++++++++++++++++++-----
> > > >    drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 +
> > > >    drivers/gpu/drm/xe/xe_guc_hwconfig.c      |  2 +-
> > > >    3 files changed, 50 insertions(+), 13 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> > > > index 81b9d9bb3f57..a9e618abf8ac 100644
> > > > --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> > > > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> > > > @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
> > > >    	return &q->gt->uc.guc;
> > > >    }
> > > > -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> > > > +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
> > > > +				     ssize_t start,
> > > >    				     struct xe_devcoredump *coredump)
> > > >    {
> > > >    	struct xe_device *xe;
> > > > @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> > > >    	ss = &coredump->snapshot;
> > > >    	iter.data = buffer;
> > > > -	iter.start = 0;
> > > > +	iter.start = start;
> > > >    	iter.remain = count;
> > > >    	p = drm_coredump_printer(&iter);
> > > > @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
> > > >    	ss->vm = NULL;
> > > >    }
> > > > +#define XE_DEVCOREDUMP_CHUNK_MAX	(SZ_512M + SZ_1G)
> > > > +
> > > >    static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > > >    				   size_t count, void *data, size_t datalen)
> > > >    {
> > > > @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > > >    	/* Ensure delayed work is captured before continuing */
> > > >    	flush_work(&ss->work);
> > > > +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> > > > +		xe_pm_runtime_get(gt_to_xe(ss->gt));
> > > > +
> > > >    	mutex_lock(&coredump->lock);
> > > >    	if (!ss->read.buffer) {
> > > > @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > > >    		return 0;
> > > >    	}
> > > > +	if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
> > > > +	    offset < ss->read.chunk_position) {
> > > > +		ss->read.chunk_position =
> > > > +			ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
> > > > +
> > > > +		__xe_devcoredump_read(ss->read.buffer,
> > > > +				      XE_DEVCOREDUMP_CHUNK_MAX,
> > > > +				      ss->read.chunk_position, coredump);
> > > > +	}
> > > > +
> > > >    	byte_copied = count < ss->read.size - offset ? count :
> > > >    		ss->read.size - offset;
> > > > -	memcpy(buffer, ss->read.buffer + offset, byte_copied);
> > > > +	memcpy(buffer, ss->read.buffer +
> > > > +	       (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
> > > >    	mutex_unlock(&coredump->lock);
> > > > +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> > > > +		xe_pm_runtime_put(gt_to_xe(ss->gt));
> > > > +
> > > >    	return byte_copied;
> > > >    }
> > > > @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
> > > >    	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
> > > >    	xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
> > > > -	xe_pm_runtime_put(xe);
> > > > +	ss->read.chunk_position = 0;
> > > >    	/* Calculate devcoredump size */
> > > > -	ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
> > > > -
> > > > -	ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> > > > -	if (!ss->read.buffer)
> > > > -		return;
> > > > +	ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
> > > > +
> > > > +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
> > > > +		ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
> > > > +					   GFP_USER);
> > > > +		if (!ss->read.buffer)
> > > > +			goto put_pm;
> > > > +
> > > > +		__xe_devcoredump_read(ss->read.buffer,
> > > > +				      XE_DEVCOREDUMP_CHUNK_MAX,
> > > > +				      0, coredump);
> > > > +	} else {
> > > > +		ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> > > > +		if (!ss->read.buffer)
> > > > +			goto put_pm;
> > > > +
> > > > +		__xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
> > > > +				      coredump);
> > > > +		xe_devcoredump_snapshot_free(ss);
> > > > +	}
> > > > -	__xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
> > > > -	xe_devcoredump_snapshot_free(ss);
> > > > +put_pm:
> > > > +	xe_pm_runtime_put(xe);
> > > >    }
> > > >    static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> > > > @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
> > > >    	if (offset & 3)
> > > >    		drm_printf(p, "Offset not word aligned: %zu", offset);
> > > > -	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
> > > > +	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
> > > Is this related? Or should it be a separate patch?
> > > 
> > It is related. Now that __xe_devcoredump_read is called under
> > 'coredump->lock' we are in the path of reclaim, __xe_devcoredump_read
> > calls xe_print_blob_ascii85.
> > 
> > Both cases the allocation is relatively small, so would be fairly
> > unlikely to fail.
> > 
> > I could make this a seperate prep patch if you think that would be
> > better.
> Not worth splitting if this is the requirement. But maybe just add a comment
> to the commit message about why it is necessary.
> 

Sure.

> > 
> > > >    	if (!line_buff) {
> > > >    		drm_printf(p, "Failed to allocate line buffer\n");
> > > >    		return;
> > > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > > > index 1a1d16a96b2d..a174385a6d83 100644
> > > > --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > > > @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
> > > >    	struct {
> > > >    		/** @read.size: size of devcoredump in human readable format */
> > > >    		ssize_t size;
> > > > +		/** @read.chunk_position: position of devcoredump chunk */
> > > > +		ssize_t chunk_position;
> > > >    		/** @read.buffer: buffer of devcoredump in human readable format */
> > > >    		char *buffer;
> > > >    	} read;
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > > > index af2c817d552c..21403a250834 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > > > @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
> > > >    	if (num_dw == 0)
> > > >    		return -EINVAL;
> > > > -	hwconfig = kzalloc(size, GFP_KERNEL);
> > > > +	hwconfig = kzalloc(size, GFP_ATOMIC);
> > > Likewise this?
> Hmm, hadn't realised we were doing an alloc/copy/free for every config table
> access. Is that just to avoid doing iosys type reads because the table is in
> LMEM? Seems a lot of overhead for each individual u32 read!
> 
> But also why are we doing table reads during core dump -> text printing?
> AFAICT, the only reads are in the steering code for doing EU register reads.
> Don't all the reg reads happen up front and then we stop accessing the
> hardware while doing the chunked dump? If we are re-reading hardware state
> when doing the text conversion we can end up with inconsistent dumps as the
> state changes from one chunk to the next?
> 

Honestly not familiar with this code at all, lockdep puked and just fixed it. :)

On its surface, it does like an unnecessary allocation.

Matt

> John.
> 
> 
> > > 
> > Same as above.
> > 
> > Matt
> > 
> > > John.
> > > 
> > > >    	if (!hwconfig)
> > > >    		return -ENOMEM;
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* ✓ CI.Patch_applied: success for Large devcoredump file support (rev2)
  2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
                   ` (3 preceding siblings ...)
  2025-04-03 20:27 ` [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full Matthew Brost
@ 2025-04-04  3:31 ` Patchwork
  2025-04-04  3:32 ` ✓ CI.checkpatch: " Patchwork
  2025-04-04  3:32 ` ✗ CI.KUnit: failure " Patchwork
  6 siblings, 0 replies; 20+ messages in thread
From: Patchwork @ 2025-04-04  3:31 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe

== Series Details ==

Series: Large devcoredump file support (rev2)
URL   : https://patchwork.freedesktop.org/series/147085/
State : success

== Summary ==

=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: bc18da45d48d drm-tip: 2025y-04m-03d-16h-39m-16s UTC integration manifest
=== git am output follows ===
Applying: drm/xe: Add devcoredump chunking
Applying: drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
Applying: drm/print: Add drm_printer_is_full
Applying: drm/xe: Abort printing coredump in VM printer output if full



^ permalink raw reply	[flat|nested] 20+ messages in thread

* ✓ CI.checkpatch: success for Large devcoredump file support (rev2)
  2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
                   ` (4 preceding siblings ...)
  2025-04-04  3:31 ` ✓ CI.Patch_applied: success for Large devcoredump file support (rev2) Patchwork
@ 2025-04-04  3:32 ` Patchwork
  2025-04-04  3:32 ` ✗ CI.KUnit: failure " Patchwork
  6 siblings, 0 replies; 20+ messages in thread
From: Patchwork @ 2025-04-04  3:32 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe

== Series Details ==

Series: Large devcoredump file support (rev2)
URL   : https://patchwork.freedesktop.org/series/147085/
State : success

== Summary ==

+ KERNEL=/kernel
+ git clone https://gitlab.freedesktop.org/drm/maintainer-tools mt
Cloning into 'mt'...
warning: redirecting to https://gitlab.freedesktop.org/drm/maintainer-tools.git/
+ git -C mt rev-list -n1 origin/master
99e5a866b5e13f134e606a3e29d9508d97826fb3
+ cd /kernel
+ git config --global --add safe.directory /kernel
+ git log -n1
commit 3be0df16c3f601049941a67eda14fc2517175ec5
Author: Matthew Brost <matthew.brost@intel.com>
Date:   Thu Apr 3 13:27:05 2025 -0700

    drm/xe: Abort printing coredump in VM printer output if full
    
    Abort printing coredump in VM printer output if full. Helps speedup
    large coredumps which need to walked multiple times in
    xe_devcoredump_read.
    
    Signed-off-by: Matthew Brost <matthew.brost@intel.com>
    Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
+ /mt/dim checkpatch bc18da45d48d337b92a7ff9546ba61da32b3b586 drm-intel
8bc892e4cfa7 drm/xe: Add devcoredump chunking
12ddfe38c995 drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
4bbe5e66b06a drm/print: Add drm_printer_is_full
3be0df16c3f6 drm/xe: Abort printing coredump in VM printer output if full



^ permalink raw reply	[flat|nested] 20+ messages in thread

* ✗ CI.KUnit: failure for Large devcoredump file support (rev2)
  2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
                   ` (5 preceding siblings ...)
  2025-04-04  3:32 ` ✓ CI.checkpatch: " Patchwork
@ 2025-04-04  3:32 ` Patchwork
  6 siblings, 0 replies; 20+ messages in thread
From: Patchwork @ 2025-04-04  3:32 UTC (permalink / raw)
  To: Matthew Brost; +Cc: intel-xe

== Series Details ==

Series: Large devcoredump file support (rev2)
URL   : https://patchwork.freedesktop.org/series/147085/
State : failure

== Summary ==

+ trap cleanup EXIT
+ /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
[03:32:02] Configuring KUnit Kernel ...
Generating .config ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
[03:32:06] Building KUnit Kernel ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
Building with:
$ make all compile_commands.json ARCH=um O=.kunit --jobs=48
ERROR:root:../lib/iomap.c:156:5: warning: no previous prototype for ‘ioread64_lo_hi’ [-Wmissing-prototypes]
  156 | u64 ioread64_lo_hi(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~
../lib/iomap.c:163:5: warning: no previous prototype for ‘ioread64_hi_lo’ [-Wmissing-prototypes]
  163 | u64 ioread64_hi_lo(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~
../lib/iomap.c:170:5: warning: no previous prototype for ‘ioread64be_lo_hi’ [-Wmissing-prototypes]
  170 | u64 ioread64be_lo_hi(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~~~
../lib/iomap.c:178:5: warning: no previous prototype for ‘ioread64be_hi_lo’ [-Wmissing-prototypes]
  178 | u64 ioread64be_hi_lo(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~~~
../lib/iomap.c:264:6: warning: no previous prototype for ‘iowrite64_lo_hi’ [-Wmissing-prototypes]
  264 | void iowrite64_lo_hi(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~
../lib/iomap.c:272:6: warning: no previous prototype for ‘iowrite64_hi_lo’ [-Wmissing-prototypes]
  272 | void iowrite64_hi_lo(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~
../lib/iomap.c:280:6: warning: no previous prototype for ‘iowrite64be_lo_hi’ [-Wmissing-prototypes]
  280 | void iowrite64be_lo_hi(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~~~
../lib/iomap.c:288:6: warning: no previous prototype for ‘iowrite64be_hi_lo’ [-Wmissing-prototypes]
  288 | void iowrite64be_hi_lo(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~~~
../drivers/gpu/drm/xe/xe_migrate.c: In function ‘xe_migrate_access_memory’:
../drivers/gpu/drm/xe/xe_migrate.c:1819:19: error: ‘XE_CACHELINE_MASK’ undeclared (first use in this function)
 1819 |         if (len & XE_CACHELINE_MASK ||
      |                   ^~~~~~~~~~~~~~~~~
../drivers/gpu/drm/xe/xe_migrate.c:1819:19: note: each undeclared identifier is reported only once for each function it appears in
../drivers/gpu/drm/xe/xe_migrate.c:1828:35: error: ‘XE_CACHELINE_BYTES’ undeclared (first use in this function); did you mean ‘L1_CACHE_BYTES’?
 1828 |                         u8 bounce[XE_CACHELINE_BYTES];
      |                                   ^~~~~~~~~~~~~~~~~~
      |                                   L1_CACHE_BYTES
../drivers/gpu/drm/xe/xe_migrate.c:1828:28: warning: unused variable ‘bounce’ [-Wunused-variable]
 1828 |                         u8 bounce[XE_CACHELINE_BYTES];
      |                            ^~~~~~
../drivers/gpu/drm/xe/xe_migrate.c:1888:27: error: implicit declaration of function ‘xe_migrate_vram’; did you mean ‘xe_migrate_to_vram’? [-Werror=implicit-function-declaration]
 1888 |                 __fence = xe_migrate_vram(m, current_bytes,
      |                           ^~~~~~~~~~~~~~~
      |                           xe_migrate_to_vram
../drivers/gpu/drm/xe/xe_migrate.c:1892:43: error: ‘XE_MIGRATE_COPY_TO_VRAM’ undeclared (first use in this function)
 1892 |                                           XE_MIGRATE_COPY_TO_VRAM :
      |                                           ^~~~~~~~~~~~~~~~~~~~~~~
../drivers/gpu/drm/xe/xe_migrate.c:1893:43: error: ‘XE_MIGRATE_COPY_TO_SRAM’ undeclared (first use in this function)
 1893 |                                           XE_MIGRATE_COPY_TO_SRAM);
      |                                           ^~~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
make[7]: *** [../scripts/Makefile.build:207: drivers/gpu/drm/xe/xe_migrate.o] Error 1
make[7]: *** Waiting for unfinished jobs....
make[6]: *** [../scripts/Makefile.build:465: drivers/gpu/drm/xe] Error 2
make[5]: *** [../scripts/Makefile.build:465: drivers/gpu/drm] Error 2
make[4]: *** [../scripts/Makefile.build:465: drivers/gpu] Error 2
make[3]: *** [../scripts/Makefile.build:465: drivers] Error 2
make[2]: *** [/kernel/Makefile:1994: .] Error 2
make[1]: *** [/kernel/Makefile:251: __sub-make] Error 2
make: *** [Makefile:251: __sub-make] Error 2

+ cleanup
++ stat -c %u:%g /kernel
+ chown -R 1003:1003 /kernel



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
  2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
  2025-04-03 21:28   ` Cavitt, Jonathan
  2025-04-04  1:03   ` kernel test robot
@ 2025-04-04  3:36   ` kernel test robot
  2025-04-04  3:46   ` kernel test robot
  3 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2025-04-04  3:36 UTC (permalink / raw)
  To: Matthew Brost; +Cc: llvm, oe-kbuild-all

Hi Matthew,

kernel test robot noticed the following build errors:

[auto build test ERROR on drm-xe/drm-xe-next]
[also build test ERROR on next-20250403]
[cannot apply to drm-exynos/exynos-drm-next linus/master drm/drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-misc/drm-misc-next drm-tip/drm-tip v6.14]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Matthew-Brost/drm-xe-Add-devcoredump-chunking/20250404-042700
base:   https://gitlab.freedesktop.org/drm/xe/kernel.git drm-xe-next
patch link:    https://lore.kernel.org/r/20250403202705.18488-3-matthew.brost%40intel.com
patch subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
config: x86_64-buildonly-randconfig-003-20250404 (https://download.01.org/0day-ci/archive/20250404/202504041302.vLhBU2QJ-lkp@intel.com/config)
compiler: clang version 20.1.2 (https://github.com/llvm/llvm-project 58df0ef89dd64126512e4ee27b4ac3fd8ddf6247)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250404/202504041302.vLhBU2QJ-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202504041302.vLhBU2QJ-lkp@intel.com/

All errors (new ones prefixed by >>):

>> drivers/gpu/drm/xe/xe_migrate.c:1819:12: error: use of undeclared identifier 'XE_CACHELINE_MASK'
    1819 |         if (len & XE_CACHELINE_MASK ||
         |                   ^
   drivers/gpu/drm/xe/xe_migrate.c:1820:34: error: use of undeclared identifier 'XE_CACHELINE_MASK'
    1820 |             ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
         |                                         ^
>> drivers/gpu/drm/xe/xe_migrate.c:1828:14: error: use of undeclared identifier 'XE_CACHELINE_BYTES'
    1828 |                         u8 bounce[XE_CACHELINE_BYTES];
         |                                   ^
   drivers/gpu/drm/xe/xe_migrate.c:1833:23: error: use of undeclared identifier 'XE_CACHELINE_MASK'
    1833 |                                                (offset & XE_CACHELINE_MASK));
         |                                                          ^
   drivers/gpu/drm/xe/xe_migrate.c:1832:13: error: use of undeclared identifier 'XE_CACHELINE_BYTES'
    1832 |                                                XE_CACHELINE_BYTES -
         |                                                ^
   drivers/gpu/drm/xe/xe_migrate.c:1834:30: error: use of undeclared identifier 'XE_CACHELINE_MASK'
    1834 |                         int ptr_offset = offset & XE_CACHELINE_MASK;
         |                                                   ^
   drivers/gpu/drm/xe/xe_migrate.c:1838:15: error: use of undeclared identifier 'XE_CACHELINE_MASK'
    1838 |                                                        ~XE_CACHELINE_MASK,
         |                                                         ^
   drivers/gpu/drm/xe/xe_migrate.c:1852:24: error: use of undeclared identifier 'XE_CACHELINE_MASK'
    1852 |                                                        offset & ~XE_CACHELINE_MASK,
         |                                                                  ^
>> drivers/gpu/drm/xe/xe_migrate.c:1888:13: error: call to undeclared function 'xe_migrate_vram'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    1888 |                 __fence = xe_migrate_vram(m, current_bytes,
         |                           ^
   drivers/gpu/drm/xe/xe_migrate.c:1888:13: note: did you mean 'xe_migrate_to_vram'?
   drivers/gpu/drm/xe/xe_migrate.h:98:19: note: 'xe_migrate_to_vram' declared here
      98 | struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
         |                   ^
>> drivers/gpu/drm/xe/xe_migrate.c:1892:8: error: use of undeclared identifier 'XE_MIGRATE_COPY_TO_VRAM'
    1892 |                                           XE_MIGRATE_COPY_TO_VRAM :
         |                                           ^
>> drivers/gpu/drm/xe/xe_migrate.c:1893:8: error: use of undeclared identifier 'XE_MIGRATE_COPY_TO_SRAM'
    1893 |                                           XE_MIGRATE_COPY_TO_SRAM);
         |                                           ^
   11 errors generated.


vim +/XE_CACHELINE_MASK +1819 drivers/gpu/drm/xe/xe_migrate.c

  1785	
  1786	/**
  1787	 * xe_migrate_access_memory - Access memory of a BO via GPU
  1788	 *
  1789	 * @m: The migration context.
  1790	 * @bo: buffer object
  1791	 * @offset: access offset into buffer object
  1792	 * @buf: pointer to caller memory to read into or write from
  1793	 * @len: length of access
  1794	 * @write: write access
  1795	 *
  1796	 * Access memory of a BO via GPU either reading in or writing from a passed in
  1797	 * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
  1798	 * read to or write from pointer.
  1799	 *
  1800	 * Returns:
  1801	 * 0 if successful, negative error code on failure.
  1802	 */
  1803	int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
  1804				     unsigned long offset, void *buf, int len,
  1805				     int write)
  1806	{
  1807		struct xe_tile *tile = m->tile;
  1808		struct xe_device *xe = tile_to_xe(tile);
  1809		struct xe_res_cursor cursor;
  1810		struct dma_fence *fence = NULL;
  1811		dma_addr_t *dma_addr;
  1812		unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
  1813		int bytes_left = len, current_page = 0;
  1814		void *orig_buf = buf;
  1815	
  1816		xe_bo_assert_held(bo);
  1817	
  1818		/* Use bounce buffer for small access and unaligned access */
> 1819		if (len & XE_CACHELINE_MASK ||
  1820		    ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
  1821			int buf_offset = 0;
  1822	
  1823			/*
  1824			 * Less than ideal for large unaligned access but this should be
  1825			 * fairly rare, can fixup if this becomes common.
  1826			 */
  1827			do {
> 1828				u8 bounce[XE_CACHELINE_BYTES];
  1829				void *ptr = (void *)bounce;
  1830				int err;
  1831				int copy_bytes = min_t(int, bytes_left,
  1832						       XE_CACHELINE_BYTES -
  1833						       (offset & XE_CACHELINE_MASK));
  1834				int ptr_offset = offset & XE_CACHELINE_MASK;
  1835	
  1836				err = xe_migrate_access_memory(m, bo,
  1837							       offset &
  1838							       ~XE_CACHELINE_MASK,
  1839							       (void *)ptr,
  1840							       sizeof(bounce), 0);
  1841				if (err)
  1842					return err;
  1843	
  1844				if (!write) {
  1845					memcpy(buf + buf_offset, ptr + ptr_offset,
  1846					       copy_bytes);
  1847					goto next;
  1848				}
  1849	
  1850				memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
  1851				err = xe_migrate_access_memory(m, bo,
  1852							       offset & ~XE_CACHELINE_MASK,
  1853							       (void *)ptr,
  1854							       sizeof(bounce), 0);
  1855				if (err)
  1856					return err;
  1857	
  1858	next:
  1859				bytes_left -= copy_bytes;
  1860				buf_offset += copy_bytes;
  1861				offset += copy_bytes;
  1862			} while (bytes_left);
  1863	
  1864			return 0;
  1865		}
  1866	
  1867		dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
  1868		if (IS_ERR(dma_addr))
  1869			return PTR_ERR(dma_addr);
  1870	
  1871		xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
  1872	
  1873		do {
  1874			struct dma_fence *__fence;
  1875			u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
  1876				cursor.start;
  1877			int current_bytes;
  1878	
  1879			if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
  1880				current_bytes = min_t(int, bytes_left,
  1881						      MAX_PREEMPTDISABLE_TRANSFER);
  1882			else
  1883				current_bytes = min_t(int, bytes_left, cursor.size);
  1884	
  1885			if (fence)
  1886				dma_fence_put(fence);
  1887	
> 1888			__fence = xe_migrate_vram(m, current_bytes,
  1889						  (unsigned long)buf & ~PAGE_MASK,
  1890						  dma_addr + current_page,
  1891						  vram_addr, write ?
> 1892						  XE_MIGRATE_COPY_TO_VRAM :
> 1893						  XE_MIGRATE_COPY_TO_SRAM);
  1894			if (IS_ERR(__fence)) {
  1895				if (fence)
  1896					dma_fence_wait(fence, false);
  1897				fence = __fence;
  1898				goto out_err;
  1899			}
  1900			fence = __fence;
  1901	
  1902			buf += current_bytes;
  1903			offset += current_bytes;
  1904			current_page = (int)(buf - orig_buf) / PAGE_SIZE;
  1905			bytes_left -= current_bytes;
  1906			if (bytes_left)
  1907				xe_res_next(&cursor, current_bytes);
  1908		} while (bytes_left);
  1909	
  1910		dma_fence_wait(fence, false);
  1911		dma_fence_put(fence);
  1912		xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
  1913	
  1914		return 0;
  1915	
  1916	out_err:
  1917		xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
  1918		return PTR_ERR(fence);
  1919	}
  1920	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
  2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
                     ` (2 preceding siblings ...)
  2025-04-04  3:36   ` kernel test robot
@ 2025-04-04  3:46   ` kernel test robot
  3 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2025-04-04  3:46 UTC (permalink / raw)
  To: Matthew Brost; +Cc: oe-kbuild-all

Hi Matthew,

kernel test robot noticed the following build errors:

[auto build test ERROR on drm-xe/drm-xe-next]
[also build test ERROR on next-20250403]
[cannot apply to drm-exynos/exynos-drm-next linus/master drm/drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-misc/drm-misc-next drm-tip/drm-tip v6.14]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Matthew-Brost/drm-xe-Add-devcoredump-chunking/20250404-042700
base:   https://gitlab.freedesktop.org/drm/xe/kernel.git drm-xe-next
patch link:    https://lore.kernel.org/r/20250403202705.18488-3-matthew.brost%40intel.com
patch subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
config: alpha-allyesconfig (https://download.01.org/0day-ci/archive/20250404/202504041115.bbGTufbJ-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250404/202504041115.bbGTufbJ-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202504041115.bbGTufbJ-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/gpu/drm/xe/xe_migrate.c: In function 'xe_migrate_access_memory':
>> drivers/gpu/drm/xe/xe_migrate.c:1819:19: error: 'XE_CACHELINE_MASK' undeclared (first use in this function)
    1819 |         if (len & XE_CACHELINE_MASK ||
         |                   ^~~~~~~~~~~~~~~~~
   drivers/gpu/drm/xe/xe_migrate.c:1819:19: note: each undeclared identifier is reported only once for each function it appears in
>> drivers/gpu/drm/xe/xe_migrate.c:1828:35: error: 'XE_CACHELINE_BYTES' undeclared (first use in this function); did you mean 'L1_CACHE_BYTES'?
    1828 |                         u8 bounce[XE_CACHELINE_BYTES];
         |                                   ^~~~~~~~~~~~~~~~~~
         |                                   L1_CACHE_BYTES
   drivers/gpu/drm/xe/xe_migrate.c:1828:28: warning: unused variable 'bounce' [-Wunused-variable]
    1828 |                         u8 bounce[XE_CACHELINE_BYTES];
         |                            ^~~~~~
>> drivers/gpu/drm/xe/xe_migrate.c:1888:27: error: implicit declaration of function 'xe_migrate_vram'; did you mean 'xe_migrate_to_vram'? [-Wimplicit-function-declaration]
    1888 |                 __fence = xe_migrate_vram(m, current_bytes,
         |                           ^~~~~~~~~~~~~~~
         |                           xe_migrate_to_vram
>> drivers/gpu/drm/xe/xe_migrate.c:1892:43: error: 'XE_MIGRATE_COPY_TO_VRAM' undeclared (first use in this function)
    1892 |                                           XE_MIGRATE_COPY_TO_VRAM :
         |                                           ^~~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/xe/xe_migrate.c:1893:43: error: 'XE_MIGRATE_COPY_TO_SRAM' undeclared (first use in this function)
    1893 |                                           XE_MIGRATE_COPY_TO_SRAM);
         |                                           ^~~~~~~~~~~~~~~~~~~~~~~


vim +/XE_CACHELINE_MASK +1819 drivers/gpu/drm/xe/xe_migrate.c

  1785	
  1786	/**
  1787	 * xe_migrate_access_memory - Access memory of a BO via GPU
  1788	 *
  1789	 * @m: The migration context.
  1790	 * @bo: buffer object
  1791	 * @offset: access offset into buffer object
  1792	 * @buf: pointer to caller memory to read into or write from
  1793	 * @len: length of access
  1794	 * @write: write access
  1795	 *
  1796	 * Access memory of a BO via GPU either reading in or writing from a passed in
  1797	 * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
  1798	 * read to or write from pointer.
  1799	 *
  1800	 * Returns:
  1801	 * 0 if successful, negative error code on failure.
  1802	 */
  1803	int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
  1804				     unsigned long offset, void *buf, int len,
  1805				     int write)
  1806	{
  1807		struct xe_tile *tile = m->tile;
  1808		struct xe_device *xe = tile_to_xe(tile);
  1809		struct xe_res_cursor cursor;
  1810		struct dma_fence *fence = NULL;
  1811		dma_addr_t *dma_addr;
  1812		unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
  1813		int bytes_left = len, current_page = 0;
  1814		void *orig_buf = buf;
  1815	
  1816		xe_bo_assert_held(bo);
  1817	
  1818		/* Use bounce buffer for small access and unaligned access */
> 1819		if (len & XE_CACHELINE_MASK ||
  1820		    ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
  1821			int buf_offset = 0;
  1822	
  1823			/*
  1824			 * Less than ideal for large unaligned access but this should be
  1825			 * fairly rare, can fixup if this becomes common.
  1826			 */
  1827			do {
> 1828				u8 bounce[XE_CACHELINE_BYTES];
  1829				void *ptr = (void *)bounce;
  1830				int err;
  1831				int copy_bytes = min_t(int, bytes_left,
  1832						       XE_CACHELINE_BYTES -
  1833						       (offset & XE_CACHELINE_MASK));
  1834				int ptr_offset = offset & XE_CACHELINE_MASK;
  1835	
  1836				err = xe_migrate_access_memory(m, bo,
  1837							       offset &
  1838							       ~XE_CACHELINE_MASK,
  1839							       (void *)ptr,
  1840							       sizeof(bounce), 0);
  1841				if (err)
  1842					return err;
  1843	
  1844				if (!write) {
  1845					memcpy(buf + buf_offset, ptr + ptr_offset,
  1846					       copy_bytes);
  1847					goto next;
  1848				}
  1849	
  1850				memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
  1851				err = xe_migrate_access_memory(m, bo,
  1852							       offset & ~XE_CACHELINE_MASK,
  1853							       (void *)ptr,
  1854							       sizeof(bounce), 0);
  1855				if (err)
  1856					return err;
  1857	
  1858	next:
  1859				bytes_left -= copy_bytes;
  1860				buf_offset += copy_bytes;
  1861				offset += copy_bytes;
  1862			} while (bytes_left);
  1863	
  1864			return 0;
  1865		}
  1866	
  1867		dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
  1868		if (IS_ERR(dma_addr))
  1869			return PTR_ERR(dma_addr);
  1870	
  1871		xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
  1872	
  1873		do {
  1874			struct dma_fence *__fence;
  1875			u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
  1876				cursor.start;
  1877			int current_bytes;
  1878	
  1879			if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
  1880				current_bytes = min_t(int, bytes_left,
  1881						      MAX_PREEMPTDISABLE_TRANSFER);
  1882			else
  1883				current_bytes = min_t(int, bytes_left, cursor.size);
  1884	
  1885			if (fence)
  1886				dma_fence_put(fence);
  1887	
> 1888			__fence = xe_migrate_vram(m, current_bytes,
  1889						  (unsigned long)buf & ~PAGE_MASK,
  1890						  dma_addr + current_page,
  1891						  vram_addr, write ?
> 1892						  XE_MIGRATE_COPY_TO_VRAM :
> 1893						  XE_MIGRATE_COPY_TO_SRAM);
  1894			if (IS_ERR(__fence)) {
  1895				if (fence)
  1896					dma_fence_wait(fence, false);
  1897				fence = __fence;
  1898				goto out_err;
  1899			}
  1900			fence = __fence;
  1901	
  1902			buf += current_bytes;
  1903			offset += current_bytes;
  1904			current_page = (int)(buf - orig_buf) / PAGE_SIZE;
  1905			bytes_left -= current_bytes;
  1906			if (bytes_left)
  1907				xe_res_next(&cursor, current_bytes);
  1908		} while (bytes_left);
  1909	
  1910		dma_fence_wait(fence, false);
  1911		dma_fence_put(fence);
  1912		xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
  1913	
  1914		return 0;
  1915	
  1916	out_err:
  1917		xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
  1918		return PTR_ERR(fence);
  1919	}
  1920	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
  2025-04-03 21:28   ` Cavitt, Jonathan
@ 2025-04-10  3:58     ` Matthew Brost
  0 siblings, 0 replies; 20+ messages in thread
From: Matthew Brost @ 2025-04-10  3:58 UTC (permalink / raw)
  To: Cavitt, Jonathan
  Cc: intel-xe@lists.freedesktop.org, dri-devel@lists.freedesktop.org

On Thu, Apr 03, 2025 at 03:28:48PM -0600, Cavitt, Jonathan wrote:
> -----Original Message-----
> From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
> Sent: Thursday, April 3, 2025 1:27 PM
> To: intel-xe@lists.freedesktop.org
> Cc: dri-devel@lists.freedesktop.org
> Subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
> > 
> > Add migrate layer functions to access VRAM and update
> > xe_ttm_access_memory to use for non-visible access and large (more than
> > 16k) BO access. 8G devcoreump on BMG observed 3 minute CPU copy time vs.
> > 3s GPU copy time.
> > 
> > v4:
> >  - Fix non-page aligned accesses
> >  - Add support for small / unaligned access
> >  - Update commit message indicating migrate used for large accesses (Auld)
> >  - Fix warning in xe_res_cursor for non-zero offset
> > v5:
> >  - Fix 32 bit build (CI)
> > v6:
> >  - Rebase and use SVM migration copy functions
> > v7:
> >  - Fix build error (CI)
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> 
> I left some notes down below.  Most are non-blocking, which I labeled as such.  And
> as for the rest, it's probable that I'm just misunderstanding some parts of the code, and
> that the notes I left are not relevant.
> 
> So, I'll be providing my reviewed-by now in the case that the blocking review notes
> turn out to not need to be addressed.  And in the case they need to be addressed,
> you can take my reviewed-by for the next revision.
> 
> Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> 
> > ---
> >  drivers/gpu/drm/xe/xe_bo.c      |  15 ++-
> >  drivers/gpu/drm/xe/xe_migrate.c | 221 ++++++++++++++++++++++++++++++--
> >  drivers/gpu/drm/xe/xe_migrate.h |   4 +
> >  3 files changed, 223 insertions(+), 17 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> > index 3c7c2353d3c8..c7e6b03d4aef 100644
> > --- a/drivers/gpu/drm/xe/xe_bo.c
> > +++ b/drivers/gpu/drm/xe/xe_bo.c
> > @@ -1414,6 +1414,7 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
> >  	struct xe_res_cursor cursor;
> >  	struct xe_vram_region *vram;
> >  	int bytes_left = len;
> > +	int err = 0;
> >  
> >  	xe_bo_assert_held(bo);
> >  	xe_device_assert_mem_access(xe);
> > @@ -1421,9 +1422,14 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
> >  	if (!mem_type_is_vram(ttm_bo->resource->mem_type))
> >  		return -EIO;
> >  
> > -	/* FIXME: Use GPU for non-visible VRAM */
> > -	if (!xe_ttm_resource_visible(ttm_bo->resource))
> > -		return -EIO;
> > +	if (!xe_ttm_resource_visible(ttm_bo->resource) || len >= SZ_16K) {
> > +		struct xe_migrate *migrate =
> > +			mem_type_to_migrate(xe, ttm_bo->resource->mem_type);
> > +
> > +		err = xe_migrate_access_memory(migrate, bo, offset, buf, len,
> > +					       write);
> > +		goto out;
> > +	}
> >  
> >  	vram = res_to_mem_region(ttm_bo->resource);
> >  	xe_res_first(ttm_bo->resource, offset & PAGE_MASK,
> > @@ -1447,7 +1453,8 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
> >  			xe_res_next(&cursor, PAGE_SIZE);
> >  	} while (bytes_left);
> >  
> > -	return len;
> > +out:
> > +	return err ?: len;
> >  }
> >  
> >  const struct ttm_device_funcs xe_ttm_funcs = {
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> > index ff0fc2fb0eb9..ba8568691d99 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.c
> > +++ b/drivers/gpu/drm/xe/xe_migrate.c
> > @@ -670,6 +670,7 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
> >  	u32 mocs = 0;
> >  	u32 tile_y = 0;
> >  
> > +	xe_gt_assert(gt, !(pitch & 3));
> >  	xe_gt_assert(gt, size / pitch <= S16_MAX);
> >  	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
> >  	xe_gt_assert(gt, pitch <= U16_MAX);
> > @@ -1602,8 +1603,12 @@ enum xe_migrate_copy_dir {
> >  	XE_MIGRATE_COPY_TO_SRAM,
> >  };
> >  
> > +#define XE_CACHELINE_BYTES	64ull
> > +#define XE_CACHELINE_MASK	(XE_CACHELINE_BYTES - 1)
> > +
> >  static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> > -					 unsigned long npages,
> > +					 unsigned long len,
> > +					 unsigned long sram_offset,
> >  					 dma_addr_t *sram_addr, u64 vram_addr,
> >  					 const enum xe_migrate_copy_dir dir)
> >  {
> > @@ -1613,17 +1618,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> >  	struct dma_fence *fence = NULL;
> >  	u32 batch_size = 2;
> >  	u64 src_L0_ofs, dst_L0_ofs;
> > -	u64 round_update_size;
> >  	struct xe_sched_job *job;
> >  	struct xe_bb *bb;
> >  	u32 update_idx, pt_slot = 0;
> > +	unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
> > +	unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
> > +		PAGE_SIZE : 4;
> >  	int err;
> >  
> > -	if (npages * PAGE_SIZE > MAX_PREEMPTDISABLE_TRANSFER)
> > -		return ERR_PTR(-EINVAL);
> > +	if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
> > +			(sram_offset | vram_addr) & XE_CACHELINE_MASK))
> > +		return ERR_PTR(-EOPNOTSUPP);
> >  
> > -	round_update_size = npages * PAGE_SIZE;
> > -	batch_size += pte_update_cmd_size(round_update_size);
> > +	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);
> 
> Hmm... Does this case still need to return an error value?  I don't think replacing it with an
> xe_assert is entirely valid, as asserts should not be used as a replacement of proper error
> handling.  Or so I've been told.
> 
> So, I guess the question is: what error was this preventing previously, and is it still a concern?
> 

This should have been an assert before - it just validating the caller
passing in sane arguments which we do everywhere in Xe. If that changes,
then boom the assert will pop. 

> > +
> > +	batch_size += pte_update_cmd_size(len);
> >  	batch_size += EMIT_COPY_DW;
> >  
> >  	bb = xe_bb_new(gt, batch_size, use_usm_batch);
> > @@ -1633,22 +1642,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> >  	}
> >  
> >  	build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
> > -				   sram_addr, round_update_size);
> > +				   sram_addr, len + sram_offset);
> >  
> >  	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
> > -		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
> > +		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
> >  		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
> >  
> >  	} else {
> >  		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
> > -		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
> > +		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
> >  	}
> >  
> >  	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> >  	update_idx = bb->len;
> >  
> > -	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
> > -		  XE_PAGE_SIZE);
> > +	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
> >  
> >  	job = xe_bb_create_migration_job(m->q, bb,
> >  					 xe_migrate_batch_base(m, use_usm_batch),
> > @@ -1696,7 +1704,7 @@ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
> >  				     dma_addr_t *src_addr,
> >  				     u64 dst_addr)
> >  {
> > -	return xe_migrate_vram(m, npages, src_addr, dst_addr,
> > +	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
> >  			       XE_MIGRATE_COPY_TO_VRAM);
> >  }
> >  
> > @@ -1717,12 +1725,199 @@ struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
> >  				       u64 src_addr,
> >  				       dma_addr_t *dst_addr)
> >  {
> > -	return xe_migrate_vram(m, npages, dst_addr, src_addr,
> > +	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
> >  			       XE_MIGRATE_COPY_TO_SRAM);
> >  }
> >  
> >  #endif
> >  
> > +static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
> > +				 int len, int write)
> > +{
> > +	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!dma_addr[i])
> > +			continue;
> 
> Non-blocking:
> I'm guessing the dma_addr array can be non-contiguous (I.E. position 3 having a
> dma address does not imply positions 1 and 2 have them).  Because otherwise
> this can be a break condition instead of a continue.
>

It should be a break, let me fix that.
 
> > +
> > +		dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
> > +			       write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
> > +	}
> > +	kfree(dma_addr);
> > +}
> > +
> > +static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe,
> > +				      void *buf, int len, int write)
> > +{
> > +	dma_addr_t *dma_addr;
> > +	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
> > +
> > +	dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL);
> > +	if (!dma_addr)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		dma_addr_t addr;
> > +		struct page *page;
> > +
> > +		if (is_vmalloc_addr(buf))
> > +			page = vmalloc_to_page(buf);
> > +		else
> > +			page = virt_to_page(buf);
> > +
> > +		addr = dma_map_page(xe->drm.dev,
> > +				    page, 0, PAGE_SIZE,
> > +				    write ? DMA_TO_DEVICE :
> > +				    DMA_FROM_DEVICE);
> > +		if (dma_mapping_error(xe->drm.dev, addr))
> > +			goto err_fault;
> > +
> > +		dma_addr[i] = addr;
> > +		buf += PAGE_SIZE;
> > +	}
> > +
> > +	return dma_addr;
> > +
> > +err_fault:
> > +	xe_migrate_dma_unmap(xe, dma_addr, len, write);
> > +	return ERR_PTR(-EFAULT);
> > +}
> > +
> > +/**
> > + * xe_migrate_access_memory - Access memory of a BO via GPU
> > + *
> > + * @m: The migration context.
> > + * @bo: buffer object
> > + * @offset: access offset into buffer object
> > + * @buf: pointer to caller memory to read into or write from
> > + * @len: length of access
> > + * @write: write access
> > + *
> > + * Access memory of a BO via GPU either reading in or writing from a passed in
> > + * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
> > + * read to or write from pointer.
> > + *
> > + * Returns:
> > + * 0 if successful, negative error code on failure.
> > + */
> > +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> > +			     unsigned long offset, void *buf, int len,
> > +			     int write)
> > +{
> > +	struct xe_tile *tile = m->tile;
> > +	struct xe_device *xe = tile_to_xe(tile);
> > +	struct xe_res_cursor cursor;
> > +	struct dma_fence *fence = NULL;
> > +	dma_addr_t *dma_addr;
> > +	unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
> > +	int bytes_left = len, current_page = 0;
> > +	void *orig_buf = buf;
> > +
> > +	xe_bo_assert_held(bo);
> > +
> > +	/* Use bounce buffer for small access and unaligned access */
> > +	if (len & XE_CACHELINE_MASK ||
> > +	    ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
> > +		int buf_offset = 0;
> > +
> > +		/*
> > +		 * Less than ideal for large unaligned access but this should be
> > +		 * fairly rare, can fixup if this becomes common.
> > +		 */
> > +		do {
> > +			u8 bounce[XE_CACHELINE_BYTES];
> > +			void *ptr = (void *)bounce;
> > +			int err;
> > +			int copy_bytes = min_t(int, bytes_left,
> > +					       XE_CACHELINE_BYTES -
> > +					       (offset & XE_CACHELINE_MASK));
> > +			int ptr_offset = offset & XE_CACHELINE_MASK;
> > +
> > +			err = xe_migrate_access_memory(m, bo,
> > +						       offset &
> > +						       ~XE_CACHELINE_MASK,
> > +						       (void *)ptr,
> > +						       sizeof(bounce), 0);
> > +			if (err)
> > +				return err;
> > +
> > +			if (!write) {
> > +				memcpy(buf + buf_offset, ptr + ptr_offset,
> > +				       copy_bytes);
> > +				goto next;
> > +			}
> 
> Non-blocking:
> This should debatably be an if-else block, rather than relying on a "goto next"
> here, as "goto next" is not used elsewhere in the function.
> 
> """
> 	if (write) {
> 		memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
> 		err = xe_migrate_access_memory(m, bo,
> 						    offset & ~XE_CACHELINE_MASK,
> 						    (void *)ptr,
> 						    sizeof(bounce), 0);
> 	} else {
> 		memcpy(buf + buf_offset, ptr + ptr_offset,
> 			  copy_bytes);
> 	}
> 
> 	if (err)
> 		return err;
> """

Yea, let clean this up.

> 
> > +
> > +			memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
> > +			err = xe_migrate_access_memory(m, bo,
> > +						       offset & ~XE_CACHELINE_MASK,
> > +						       (void *)ptr,
> > +						       sizeof(bounce), 0);
> > +			if (err)
> > +				return err;
> > +
> > +next:
> > +			bytes_left -= copy_bytes;
> > +			buf_offset += copy_bytes;
> > +			offset += copy_bytes;
> > +		} while (bytes_left);
> > +
> > +		return 0;
> > +	}
> > +
> > +	dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
> > +	if (IS_ERR(dma_addr))
> > +		return PTR_ERR(dma_addr);
> > +
> > +	xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
> > +
> > +	do {
> > +		struct dma_fence *__fence;
> > +		u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
> > +			cursor.start;
> > +		int current_bytes;
> > +
> > +		if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
> > +			current_bytes = min_t(int, bytes_left,
> > +					      MAX_PREEMPTDISABLE_TRANSFER);
> > +		else
> > +			current_bytes = min_t(int, bytes_left, cursor.size);
> > +
> > +		if (fence)
> > +			dma_fence_put(fence);
> > +
> > +		__fence = xe_migrate_vram(m, current_bytes,
> > +					  (unsigned long)buf & ~PAGE_MASK,
> > +					  dma_addr + current_page,
> > +					  vram_addr, write ?
> > +					  XE_MIGRATE_COPY_TO_VRAM :
> > +					  XE_MIGRATE_COPY_TO_SRAM);
> > +		if (IS_ERR(__fence)) {
> > +			if (fence)
> > +				dma_fence_wait(fence, false);
> > +			fence = __fence;
> > +			goto out_err;
> > +		}
> > +		fence = __fence;
> 
> Non-blocking:
> It would be nice if we didn't have two paths assigning fence the __fence value,
> but any attempt I do to try and change that ends in having to compute the IS_ERR
> value for __fence twice.  E.G.:
> 
> """
> 	if (IS_ERR(__fence) && fence)
> 		dma_fence_wait(fence, false);
> 	fence = __fence;
> 	if (IS_ERR(fence))
> 		goto out_err;
> """
> 
> So, either way works.
> 

I prefer the way I have it.

> > +
> > +		buf += current_bytes;
> > +		offset += current_bytes;
> > +		current_page = (int)(buf - orig_buf) / PAGE_SIZE;
> > +		bytes_left -= current_bytes;
> > +		if (bytes_left)
> > +			xe_res_next(&cursor, current_bytes);
> > +	} while (bytes_left);
> > +
> > +	dma_fence_wait(fence, false);
> > +	dma_fence_put(fence);
> > +	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
> > +
> > +	return 0;
> > +
> > +out_err:
> > +	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
> > +	return PTR_ERR(fence);
> 
> Non-blocking:
> The branching execution might be able to be shortened.  Perhaps:
> """
> 	dma_fence_wait(fence, false);
> 	dma_fence_put(fence);
> 
> out_err:
> 	xe_migtate_dma_unmap(xe, dma_addr, len + page_offset, write);
> 	return IS_ERR(fence) ? PTR_ERR(fence) : 0;
> }
> """

Sure.

Matt

> -Jonathan Cavitt
> 
> > +}
> > +
> >  #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
> >  #include "tests/xe_migrate.c"
> >  #endif
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> > index 6ff9a963425c..fb9839c1bae0 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.h
> > +++ b/drivers/gpu/drm/xe/xe_migrate.h
> > @@ -112,6 +112,10 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
> >  				  struct ttm_resource *dst,
> >  				  bool copy_only_ccs);
> >  
> > +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> > +			     unsigned long offset, void *buf, int len,
> > +			     int write);
> > +
> >  #define XE_MIGRATE_CLEAR_FLAG_BO_DATA		BIT(0)
> >  #define XE_MIGRATE_CLEAR_FLAG_CCS_DATA		BIT(1)
> >  #define XE_MIGRATE_CLEAR_FLAG_FULL	(XE_MIGRATE_CLEAR_FLAG_BO_DATA | \
> > -- 
> > 2.34.1
> > 
> > 

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2025-04-10  3:57 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
2025-04-03 21:28   ` Cavitt, Jonathan
2025-04-03 21:39   ` John Harrison
2025-04-03 22:32     ` Matthew Brost
2025-04-03 22:41       ` John Harrison
2025-04-04  2:33         ` Matthew Brost
2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
2025-04-03 21:28   ` Cavitt, Jonathan
2025-04-10  3:58     ` Matthew Brost
2025-04-04  1:03   ` kernel test robot
2025-04-04  3:36   ` kernel test robot
2025-04-04  3:46   ` kernel test robot
2025-04-03 20:27 ` [PATCH v2 3/4] drm/print: Add drm_printer_is_full Matthew Brost
2025-04-03 21:28   ` Cavitt, Jonathan
2025-04-03 20:27 ` [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full Matthew Brost
2025-04-03 21:28   ` Cavitt, Jonathan
2025-04-04  3:31 ` ✓ CI.Patch_applied: success for Large devcoredump file support (rev2) Patchwork
2025-04-04  3:32 ` ✓ CI.checkpatch: " Patchwork
2025-04-04  3:32 ` ✗ CI.KUnit: failure " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.