* [PATCH v2 0/4] Large devcoredump file support
@ 2025-04-03 20:27 Matthew Brost
2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
` (6 more replies)
0 siblings, 7 replies; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 20:27 UTC (permalink / raw)
To: intel-xe; +Cc: dri-devel
Devcoredump were truncated at 2G, remove this restriction. While here,
add support for GPU copies of BOs to increase devcoredump speed.
v2:
- Fix build error
- Abort printing once printer if full
Matthew Brost (4):
drm/xe: Add devcoredump chunking
drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
drm/print: Add drm_printer_is_full
drm/xe: Abort printing coredump in VM printer output if full
drivers/gpu/drm/xe/xe_bo.c | 15 +-
drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++--
drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 +
drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +-
drivers/gpu/drm/xe/xe_migrate.c | 221 ++++++++++++++++++++--
drivers/gpu/drm/xe/xe_migrate.h | 4 +
drivers/gpu/drm/xe/xe_vm.c | 3 +
include/drm/drm_print.h | 17 ++
8 files changed, 293 insertions(+), 30 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH v2 1/4] drm/xe: Add devcoredump chunking
2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
@ 2025-04-03 20:27 ` Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
2025-04-03 21:39 ` John Harrison
2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
` (5 subsequent siblings)
6 siblings, 2 replies; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 20:27 UTC (permalink / raw)
To: intel-xe; +Cc: dri-devel
Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
callback as needed.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++-----
drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 +
drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +-
3 files changed, 50 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 81b9d9bb3f57..a9e618abf8ac 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
return &q->gt->uc.guc;
}
-static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
+static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
+ ssize_t start,
struct xe_devcoredump *coredump)
{
struct xe_device *xe;
@@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
ss = &coredump->snapshot;
iter.data = buffer;
- iter.start = 0;
+ iter.start = start;
iter.remain = count;
p = drm_coredump_printer(&iter);
@@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
ss->vm = NULL;
}
+#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G)
+
static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
size_t count, void *data, size_t datalen)
{
@@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
/* Ensure delayed work is captured before continuing */
flush_work(&ss->work);
+ if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
+ xe_pm_runtime_get(gt_to_xe(ss->gt));
+
mutex_lock(&coredump->lock);
if (!ss->read.buffer) {
@@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
return 0;
}
+ if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
+ offset < ss->read.chunk_position) {
+ ss->read.chunk_position =
+ ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
+
+ __xe_devcoredump_read(ss->read.buffer,
+ XE_DEVCOREDUMP_CHUNK_MAX,
+ ss->read.chunk_position, coredump);
+ }
+
byte_copied = count < ss->read.size - offset ? count :
ss->read.size - offset;
- memcpy(buffer, ss->read.buffer + offset, byte_copied);
+ memcpy(buffer, ss->read.buffer +
+ (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
mutex_unlock(&coredump->lock);
+ if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
+ xe_pm_runtime_put(gt_to_xe(ss->gt));
+
return byte_copied;
}
@@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
- xe_pm_runtime_put(xe);
+ ss->read.chunk_position = 0;
/* Calculate devcoredump size */
- ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
-
- ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
- if (!ss->read.buffer)
- return;
+ ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
+
+ if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
+ ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
+ GFP_USER);
+ if (!ss->read.buffer)
+ goto put_pm;
+
+ __xe_devcoredump_read(ss->read.buffer,
+ XE_DEVCOREDUMP_CHUNK_MAX,
+ 0, coredump);
+ } else {
+ ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
+ if (!ss->read.buffer)
+ goto put_pm;
+
+ __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
+ coredump);
+ xe_devcoredump_snapshot_free(ss);
+ }
- __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
- xe_devcoredump_snapshot_free(ss);
+put_pm:
+ xe_pm_runtime_put(xe);
}
static void devcoredump_snapshot(struct xe_devcoredump *coredump,
@@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
if (offset & 3)
drm_printf(p, "Offset not word aligned: %zu", offset);
- line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
+ line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
if (!line_buff) {
drm_printf(p, "Failed to allocate line buffer\n");
return;
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index 1a1d16a96b2d..a174385a6d83 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
struct {
/** @read.size: size of devcoredump in human readable format */
ssize_t size;
+ /** @read.chunk_position: position of devcoredump chunk */
+ ssize_t chunk_position;
/** @read.buffer: buffer of devcoredump in human readable format */
char *buffer;
} read;
diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
index af2c817d552c..21403a250834 100644
--- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
+++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
@@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
if (num_dw == 0)
return -EINVAL;
- hwconfig = kzalloc(size, GFP_KERNEL);
+ hwconfig = kzalloc(size, GFP_ATOMIC);
if (!hwconfig)
return -ENOMEM;
--
2.34.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
@ 2025-04-03 20:27 ` Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
` (3 more replies)
2025-04-03 20:27 ` [PATCH v2 3/4] drm/print: Add drm_printer_is_full Matthew Brost
` (4 subsequent siblings)
6 siblings, 4 replies; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 20:27 UTC (permalink / raw)
To: intel-xe; +Cc: dri-devel
Add migrate layer functions to access VRAM and update
xe_ttm_access_memory to use for non-visible access and large (more than
16k) BO access. 8G devcoreump on BMG observed 3 minute CPU copy time vs.
3s GPU copy time.
v4:
- Fix non-page aligned accesses
- Add support for small / unaligned access
- Update commit message indicating migrate used for large accesses (Auld)
- Fix warning in xe_res_cursor for non-zero offset
v5:
- Fix 32 bit build (CI)
v6:
- Rebase and use SVM migration copy functions
v7:
- Fix build error (CI)
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_bo.c | 15 ++-
drivers/gpu/drm/xe/xe_migrate.c | 221 ++++++++++++++++++++++++++++++--
drivers/gpu/drm/xe/xe_migrate.h | 4 +
3 files changed, 223 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 3c7c2353d3c8..c7e6b03d4aef 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -1414,6 +1414,7 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
struct xe_res_cursor cursor;
struct xe_vram_region *vram;
int bytes_left = len;
+ int err = 0;
xe_bo_assert_held(bo);
xe_device_assert_mem_access(xe);
@@ -1421,9 +1422,14 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
if (!mem_type_is_vram(ttm_bo->resource->mem_type))
return -EIO;
- /* FIXME: Use GPU for non-visible VRAM */
- if (!xe_ttm_resource_visible(ttm_bo->resource))
- return -EIO;
+ if (!xe_ttm_resource_visible(ttm_bo->resource) || len >= SZ_16K) {
+ struct xe_migrate *migrate =
+ mem_type_to_migrate(xe, ttm_bo->resource->mem_type);
+
+ err = xe_migrate_access_memory(migrate, bo, offset, buf, len,
+ write);
+ goto out;
+ }
vram = res_to_mem_region(ttm_bo->resource);
xe_res_first(ttm_bo->resource, offset & PAGE_MASK,
@@ -1447,7 +1453,8 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
xe_res_next(&cursor, PAGE_SIZE);
} while (bytes_left);
- return len;
+out:
+ return err ?: len;
}
const struct ttm_device_funcs xe_ttm_funcs = {
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index ff0fc2fb0eb9..ba8568691d99 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -670,6 +670,7 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
u32 mocs = 0;
u32 tile_y = 0;
+ xe_gt_assert(gt, !(pitch & 3));
xe_gt_assert(gt, size / pitch <= S16_MAX);
xe_gt_assert(gt, pitch / 4 <= S16_MAX);
xe_gt_assert(gt, pitch <= U16_MAX);
@@ -1602,8 +1603,12 @@ enum xe_migrate_copy_dir {
XE_MIGRATE_COPY_TO_SRAM,
};
+#define XE_CACHELINE_BYTES 64ull
+#define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1)
+
static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
- unsigned long npages,
+ unsigned long len,
+ unsigned long sram_offset,
dma_addr_t *sram_addr, u64 vram_addr,
const enum xe_migrate_copy_dir dir)
{
@@ -1613,17 +1618,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
struct dma_fence *fence = NULL;
u32 batch_size = 2;
u64 src_L0_ofs, dst_L0_ofs;
- u64 round_update_size;
struct xe_sched_job *job;
struct xe_bb *bb;
u32 update_idx, pt_slot = 0;
+ unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
+ unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
+ PAGE_SIZE : 4;
int err;
- if (npages * PAGE_SIZE > MAX_PREEMPTDISABLE_TRANSFER)
- return ERR_PTR(-EINVAL);
+ if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
+ (sram_offset | vram_addr) & XE_CACHELINE_MASK))
+ return ERR_PTR(-EOPNOTSUPP);
- round_update_size = npages * PAGE_SIZE;
- batch_size += pte_update_cmd_size(round_update_size);
+ xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);
+
+ batch_size += pte_update_cmd_size(len);
batch_size += EMIT_COPY_DW;
bb = xe_bb_new(gt, batch_size, use_usm_batch);
@@ -1633,22 +1642,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
}
build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
- sram_addr, round_update_size);
+ sram_addr, len + sram_offset);
if (dir == XE_MIGRATE_COPY_TO_VRAM) {
- src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+ src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
} else {
src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
- dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+ dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
}
bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
update_idx = bb->len;
- emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
- XE_PAGE_SIZE);
+ emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
job = xe_bb_create_migration_job(m->q, bb,
xe_migrate_batch_base(m, use_usm_batch),
@@ -1696,7 +1704,7 @@ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
dma_addr_t *src_addr,
u64 dst_addr)
{
- return xe_migrate_vram(m, npages, src_addr, dst_addr,
+ return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
XE_MIGRATE_COPY_TO_VRAM);
}
@@ -1717,12 +1725,199 @@ struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
u64 src_addr,
dma_addr_t *dst_addr)
{
- return xe_migrate_vram(m, npages, dst_addr, src_addr,
+ return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
XE_MIGRATE_COPY_TO_SRAM);
}
#endif
+static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
+ int len, int write)
+{
+ unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+ for (i = 0; i < npages; ++i) {
+ if (!dma_addr[i])
+ continue;
+
+ dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
+ write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ }
+ kfree(dma_addr);
+}
+
+static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe,
+ void *buf, int len, int write)
+{
+ dma_addr_t *dma_addr;
+ unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+ dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL);
+ if (!dma_addr)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < npages; ++i) {
+ dma_addr_t addr;
+ struct page *page;
+
+ if (is_vmalloc_addr(buf))
+ page = vmalloc_to_page(buf);
+ else
+ page = virt_to_page(buf);
+
+ addr = dma_map_page(xe->drm.dev,
+ page, 0, PAGE_SIZE,
+ write ? DMA_TO_DEVICE :
+ DMA_FROM_DEVICE);
+ if (dma_mapping_error(xe->drm.dev, addr))
+ goto err_fault;
+
+ dma_addr[i] = addr;
+ buf += PAGE_SIZE;
+ }
+
+ return dma_addr;
+
+err_fault:
+ xe_migrate_dma_unmap(xe, dma_addr, len, write);
+ return ERR_PTR(-EFAULT);
+}
+
+/**
+ * xe_migrate_access_memory - Access memory of a BO via GPU
+ *
+ * @m: The migration context.
+ * @bo: buffer object
+ * @offset: access offset into buffer object
+ * @buf: pointer to caller memory to read into or write from
+ * @len: length of access
+ * @write: write access
+ *
+ * Access memory of a BO via GPU either reading in or writing from a passed in
+ * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
+ * read to or write from pointer.
+ *
+ * Returns:
+ * 0 if successful, negative error code on failure.
+ */
+int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
+ unsigned long offset, void *buf, int len,
+ int write)
+{
+ struct xe_tile *tile = m->tile;
+ struct xe_device *xe = tile_to_xe(tile);
+ struct xe_res_cursor cursor;
+ struct dma_fence *fence = NULL;
+ dma_addr_t *dma_addr;
+ unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
+ int bytes_left = len, current_page = 0;
+ void *orig_buf = buf;
+
+ xe_bo_assert_held(bo);
+
+ /* Use bounce buffer for small access and unaligned access */
+ if (len & XE_CACHELINE_MASK ||
+ ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
+ int buf_offset = 0;
+
+ /*
+ * Less than ideal for large unaligned access but this should be
+ * fairly rare, can fixup if this becomes common.
+ */
+ do {
+ u8 bounce[XE_CACHELINE_BYTES];
+ void *ptr = (void *)bounce;
+ int err;
+ int copy_bytes = min_t(int, bytes_left,
+ XE_CACHELINE_BYTES -
+ (offset & XE_CACHELINE_MASK));
+ int ptr_offset = offset & XE_CACHELINE_MASK;
+
+ err = xe_migrate_access_memory(m, bo,
+ offset &
+ ~XE_CACHELINE_MASK,
+ (void *)ptr,
+ sizeof(bounce), 0);
+ if (err)
+ return err;
+
+ if (!write) {
+ memcpy(buf + buf_offset, ptr + ptr_offset,
+ copy_bytes);
+ goto next;
+ }
+
+ memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
+ err = xe_migrate_access_memory(m, bo,
+ offset & ~XE_CACHELINE_MASK,
+ (void *)ptr,
+ sizeof(bounce), 0);
+ if (err)
+ return err;
+
+next:
+ bytes_left -= copy_bytes;
+ buf_offset += copy_bytes;
+ offset += copy_bytes;
+ } while (bytes_left);
+
+ return 0;
+ }
+
+ dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
+ if (IS_ERR(dma_addr))
+ return PTR_ERR(dma_addr);
+
+ xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
+
+ do {
+ struct dma_fence *__fence;
+ u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
+ cursor.start;
+ int current_bytes;
+
+ if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
+ current_bytes = min_t(int, bytes_left,
+ MAX_PREEMPTDISABLE_TRANSFER);
+ else
+ current_bytes = min_t(int, bytes_left, cursor.size);
+
+ if (fence)
+ dma_fence_put(fence);
+
+ __fence = xe_migrate_vram(m, current_bytes,
+ (unsigned long)buf & ~PAGE_MASK,
+ dma_addr + current_page,
+ vram_addr, write ?
+ XE_MIGRATE_COPY_TO_VRAM :
+ XE_MIGRATE_COPY_TO_SRAM);
+ if (IS_ERR(__fence)) {
+ if (fence)
+ dma_fence_wait(fence, false);
+ fence = __fence;
+ goto out_err;
+ }
+ fence = __fence;
+
+ buf += current_bytes;
+ offset += current_bytes;
+ current_page = (int)(buf - orig_buf) / PAGE_SIZE;
+ bytes_left -= current_bytes;
+ if (bytes_left)
+ xe_res_next(&cursor, current_bytes);
+ } while (bytes_left);
+
+ dma_fence_wait(fence, false);
+ dma_fence_put(fence);
+ xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
+
+ return 0;
+
+out_err:
+ xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
+ return PTR_ERR(fence);
+}
+
#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
#include "tests/xe_migrate.c"
#endif
diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
index 6ff9a963425c..fb9839c1bae0 100644
--- a/drivers/gpu/drm/xe/xe_migrate.h
+++ b/drivers/gpu/drm/xe/xe_migrate.h
@@ -112,6 +112,10 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
struct ttm_resource *dst,
bool copy_only_ccs);
+int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
+ unsigned long offset, void *buf, int len,
+ int write);
+
#define XE_MIGRATE_CLEAR_FLAG_BO_DATA BIT(0)
#define XE_MIGRATE_CLEAR_FLAG_CCS_DATA BIT(1)
#define XE_MIGRATE_CLEAR_FLAG_FULL (XE_MIGRATE_CLEAR_FLAG_BO_DATA | \
--
2.34.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 3/4] drm/print: Add drm_printer_is_full
2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
@ 2025-04-03 20:27 ` Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
2025-04-03 20:27 ` [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full Matthew Brost
` (3 subsequent siblings)
6 siblings, 1 reply; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 20:27 UTC (permalink / raw)
To: intel-xe; +Cc: dri-devel
Add drm_printer_is_full which indicates if a drm printer's output is
full. Useful to short circuit coredump printing once printer's output is
full.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
include/drm/drm_print.h | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index f31eba1c7cab..db7517ee1b19 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -242,6 +242,23 @@ struct drm_print_iterator {
ssize_t offset;
};
+/**
+ * drm_printer_is_full() - DRM printer output is full
+ * @p: DRM printer
+ *
+ * DRM printer output is full, useful to short circuit coredump printing once
+ * printer is full.
+ *
+ * RETURNS:
+ * True if DRM printer output buffer is full, False otherwise
+ */
+static inline bool drm_printer_is_full(struct drm_printer *p)
+{
+ struct drm_print_iterator *iterator = p->arg;
+
+ return !iterator->remain;
+}
+
/**
* drm_coredump_printer - construct a &drm_printer that can output to a buffer
* from the read function for devcoredump
--
2.34.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full
2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
` (2 preceding siblings ...)
2025-04-03 20:27 ` [PATCH v2 3/4] drm/print: Add drm_printer_is_full Matthew Brost
@ 2025-04-03 20:27 ` Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
2025-04-04 3:31 ` ✓ CI.Patch_applied: success for Large devcoredump file support (rev2) Patchwork
` (2 subsequent siblings)
6 siblings, 1 reply; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 20:27 UTC (permalink / raw)
To: intel-xe; +Cc: dri-devel
Abort printing coredump in VM printer output if full. Helps speedup
large coredumps which need to walked multiple times in
xe_devcoredump_read.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_vm.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 864266e38aa7..164824617a2e 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3847,6 +3847,9 @@ void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
}
drm_puts(p, "\n");
+
+ if (drm_printer_is_full(p))
+ return;
}
}
--
2.34.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* RE: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
@ 2025-04-03 21:28 ` Cavitt, Jonathan
2025-04-03 21:39 ` John Harrison
1 sibling, 0 replies; 20+ messages in thread
From: Cavitt, Jonathan @ 2025-04-03 21:28 UTC (permalink / raw)
To: Brost, Matthew, intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org, Cavitt, Jonathan
-----Original Message-----
From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Thursday, April 3, 2025 1:27 PM
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Subject: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
>
> Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
> of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
> callback as needed.
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
I have no issues with this patch, though you should maybe get a second opinion.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
-Jonathan Cavitt
> ---
> drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++-----
> drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 +
> drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +-
> 3 files changed, 50 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 81b9d9bb3f57..a9e618abf8ac 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
> return &q->gt->uc.guc;
> }
>
> -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
> + ssize_t start,
> struct xe_devcoredump *coredump)
> {
> struct xe_device *xe;
> @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> ss = &coredump->snapshot;
>
> iter.data = buffer;
> - iter.start = 0;
> + iter.start = start;
> iter.remain = count;
>
> p = drm_coredump_printer(&iter);
> @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
> ss->vm = NULL;
> }
>
> +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G)
> +
> static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> size_t count, void *data, size_t datalen)
> {
> @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> /* Ensure delayed work is captured before continuing */
> flush_work(&ss->work);
>
> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> + xe_pm_runtime_get(gt_to_xe(ss->gt));
> +
> mutex_lock(&coredump->lock);
>
> if (!ss->read.buffer) {
> @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> return 0;
> }
>
> + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
> + offset < ss->read.chunk_position) {
> + ss->read.chunk_position =
> + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
> +
> + __xe_devcoredump_read(ss->read.buffer,
> + XE_DEVCOREDUMP_CHUNK_MAX,
> + ss->read.chunk_position, coredump);
> + }
> +
> byte_copied = count < ss->read.size - offset ? count :
> ss->read.size - offset;
> - memcpy(buffer, ss->read.buffer + offset, byte_copied);
> + memcpy(buffer, ss->read.buffer +
> + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
>
> mutex_unlock(&coredump->lock);
>
> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> + xe_pm_runtime_put(gt_to_xe(ss->gt));
> +
> return byte_copied;
> }
>
> @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
> xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
> xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
>
> - xe_pm_runtime_put(xe);
> + ss->read.chunk_position = 0;
>
> /* Calculate devcoredump size */
> - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
> -
> - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> - if (!ss->read.buffer)
> - return;
> + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
> +
> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
> + ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
> + GFP_USER);
> + if (!ss->read.buffer)
> + goto put_pm;
> +
> + __xe_devcoredump_read(ss->read.buffer,
> + XE_DEVCOREDUMP_CHUNK_MAX,
> + 0, coredump);
> + } else {
> + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> + if (!ss->read.buffer)
> + goto put_pm;
> +
> + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
> + coredump);
> + xe_devcoredump_snapshot_free(ss);
> + }
>
> - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
> - xe_devcoredump_snapshot_free(ss);
> +put_pm:
> + xe_pm_runtime_put(xe);
> }
>
> static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
> if (offset & 3)
> drm_printf(p, "Offset not word aligned: %zu", offset);
>
> - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
> + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
> if (!line_buff) {
> drm_printf(p, "Failed to allocate line buffer\n");
> return;
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 1a1d16a96b2d..a174385a6d83 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
> struct {
> /** @read.size: size of devcoredump in human readable format */
> ssize_t size;
> + /** @read.chunk_position: position of devcoredump chunk */
> + ssize_t chunk_position;
> /** @read.buffer: buffer of devcoredump in human readable format */
> char *buffer;
> } read;
> diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> index af2c817d552c..21403a250834 100644
> --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
> if (num_dw == 0)
> return -EINVAL;
>
> - hwconfig = kzalloc(size, GFP_KERNEL);
> + hwconfig = kzalloc(size, GFP_ATOMIC);
> if (!hwconfig)
> return -ENOMEM;
>
> --
> 2.34.1
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
@ 2025-04-03 21:28 ` Cavitt, Jonathan
2025-04-10 3:58 ` Matthew Brost
2025-04-04 1:03 ` kernel test robot
` (2 subsequent siblings)
3 siblings, 1 reply; 20+ messages in thread
From: Cavitt, Jonathan @ 2025-04-03 21:28 UTC (permalink / raw)
To: Brost, Matthew, intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org, Cavitt, Jonathan
-----Original Message-----
From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Thursday, April 3, 2025 1:27 PM
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
>
> Add migrate layer functions to access VRAM and update
> xe_ttm_access_memory to use for non-visible access and large (more than
> 16k) BO access. 8G devcoreump on BMG observed 3 minute CPU copy time vs.
> 3s GPU copy time.
>
> v4:
> - Fix non-page aligned accesses
> - Add support for small / unaligned access
> - Update commit message indicating migrate used for large accesses (Auld)
> - Fix warning in xe_res_cursor for non-zero offset
> v5:
> - Fix 32 bit build (CI)
> v6:
> - Rebase and use SVM migration copy functions
> v7:
> - Fix build error (CI)
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
I left some notes down below. Most are non-blocking, which I labeled as such. And
as for the rest, it's probable that I'm just misunderstanding some parts of the code, and
that the notes I left are not relevant.
So, I'll be providing my reviewed-by now in the case that the blocking review notes
turn out to not need to be addressed. And in the case they need to be addressed,
you can take my reviewed-by for the next revision.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> ---
> drivers/gpu/drm/xe/xe_bo.c | 15 ++-
> drivers/gpu/drm/xe/xe_migrate.c | 221 ++++++++++++++++++++++++++++++--
> drivers/gpu/drm/xe/xe_migrate.h | 4 +
> 3 files changed, 223 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index 3c7c2353d3c8..c7e6b03d4aef 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -1414,6 +1414,7 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
> struct xe_res_cursor cursor;
> struct xe_vram_region *vram;
> int bytes_left = len;
> + int err = 0;
>
> xe_bo_assert_held(bo);
> xe_device_assert_mem_access(xe);
> @@ -1421,9 +1422,14 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
> if (!mem_type_is_vram(ttm_bo->resource->mem_type))
> return -EIO;
>
> - /* FIXME: Use GPU for non-visible VRAM */
> - if (!xe_ttm_resource_visible(ttm_bo->resource))
> - return -EIO;
> + if (!xe_ttm_resource_visible(ttm_bo->resource) || len >= SZ_16K) {
> + struct xe_migrate *migrate =
> + mem_type_to_migrate(xe, ttm_bo->resource->mem_type);
> +
> + err = xe_migrate_access_memory(migrate, bo, offset, buf, len,
> + write);
> + goto out;
> + }
>
> vram = res_to_mem_region(ttm_bo->resource);
> xe_res_first(ttm_bo->resource, offset & PAGE_MASK,
> @@ -1447,7 +1453,8 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
> xe_res_next(&cursor, PAGE_SIZE);
> } while (bytes_left);
>
> - return len;
> +out:
> + return err ?: len;
> }
>
> const struct ttm_device_funcs xe_ttm_funcs = {
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index ff0fc2fb0eb9..ba8568691d99 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -670,6 +670,7 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
> u32 mocs = 0;
> u32 tile_y = 0;
>
> + xe_gt_assert(gt, !(pitch & 3));
> xe_gt_assert(gt, size / pitch <= S16_MAX);
> xe_gt_assert(gt, pitch / 4 <= S16_MAX);
> xe_gt_assert(gt, pitch <= U16_MAX);
> @@ -1602,8 +1603,12 @@ enum xe_migrate_copy_dir {
> XE_MIGRATE_COPY_TO_SRAM,
> };
>
> +#define XE_CACHELINE_BYTES 64ull
> +#define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1)
> +
> static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> - unsigned long npages,
> + unsigned long len,
> + unsigned long sram_offset,
> dma_addr_t *sram_addr, u64 vram_addr,
> const enum xe_migrate_copy_dir dir)
> {
> @@ -1613,17 +1618,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> struct dma_fence *fence = NULL;
> u32 batch_size = 2;
> u64 src_L0_ofs, dst_L0_ofs;
> - u64 round_update_size;
> struct xe_sched_job *job;
> struct xe_bb *bb;
> u32 update_idx, pt_slot = 0;
> + unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
> + unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
> + PAGE_SIZE : 4;
> int err;
>
> - if (npages * PAGE_SIZE > MAX_PREEMPTDISABLE_TRANSFER)
> - return ERR_PTR(-EINVAL);
> + if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
> + (sram_offset | vram_addr) & XE_CACHELINE_MASK))
> + return ERR_PTR(-EOPNOTSUPP);
>
> - round_update_size = npages * PAGE_SIZE;
> - batch_size += pte_update_cmd_size(round_update_size);
> + xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);
Hmm... Does this case still need to return an error value? I don't think replacing it with an
xe_assert is entirely valid, as asserts should not be used as a replacement of proper error
handling. Or so I've been told.
So, I guess the question is: what error was this preventing previously, and is it still a concern?
> +
> + batch_size += pte_update_cmd_size(len);
> batch_size += EMIT_COPY_DW;
>
> bb = xe_bb_new(gt, batch_size, use_usm_batch);
> @@ -1633,22 +1642,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> }
>
> build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
> - sram_addr, round_update_size);
> + sram_addr, len + sram_offset);
>
> if (dir == XE_MIGRATE_COPY_TO_VRAM) {
> - src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
> + src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
> dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
>
> } else {
> src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
> - dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
> + dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
> }
>
> bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> update_idx = bb->len;
>
> - emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
> - XE_PAGE_SIZE);
> + emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
>
> job = xe_bb_create_migration_job(m->q, bb,
> xe_migrate_batch_base(m, use_usm_batch),
> @@ -1696,7 +1704,7 @@ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
> dma_addr_t *src_addr,
> u64 dst_addr)
> {
> - return xe_migrate_vram(m, npages, src_addr, dst_addr,
> + return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
> XE_MIGRATE_COPY_TO_VRAM);
> }
>
> @@ -1717,12 +1725,199 @@ struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
> u64 src_addr,
> dma_addr_t *dst_addr)
> {
> - return xe_migrate_vram(m, npages, dst_addr, src_addr,
> + return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
> XE_MIGRATE_COPY_TO_SRAM);
> }
>
> #endif
>
> +static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
> + int len, int write)
> +{
> + unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
> +
> + for (i = 0; i < npages; ++i) {
> + if (!dma_addr[i])
> + continue;
Non-blocking:
I'm guessing the dma_addr array can be non-contiguous (I.E. position 3 having a
dma address does not imply positions 1 and 2 have them). Because otherwise
this can be a break condition instead of a continue.
> +
> + dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
> + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
> + }
> + kfree(dma_addr);
> +}
> +
> +static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe,
> + void *buf, int len, int write)
> +{
> + dma_addr_t *dma_addr;
> + unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
> +
> + dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL);
> + if (!dma_addr)
> + return ERR_PTR(-ENOMEM);
> +
> + for (i = 0; i < npages; ++i) {
> + dma_addr_t addr;
> + struct page *page;
> +
> + if (is_vmalloc_addr(buf))
> + page = vmalloc_to_page(buf);
> + else
> + page = virt_to_page(buf);
> +
> + addr = dma_map_page(xe->drm.dev,
> + page, 0, PAGE_SIZE,
> + write ? DMA_TO_DEVICE :
> + DMA_FROM_DEVICE);
> + if (dma_mapping_error(xe->drm.dev, addr))
> + goto err_fault;
> +
> + dma_addr[i] = addr;
> + buf += PAGE_SIZE;
> + }
> +
> + return dma_addr;
> +
> +err_fault:
> + xe_migrate_dma_unmap(xe, dma_addr, len, write);
> + return ERR_PTR(-EFAULT);
> +}
> +
> +/**
> + * xe_migrate_access_memory - Access memory of a BO via GPU
> + *
> + * @m: The migration context.
> + * @bo: buffer object
> + * @offset: access offset into buffer object
> + * @buf: pointer to caller memory to read into or write from
> + * @len: length of access
> + * @write: write access
> + *
> + * Access memory of a BO via GPU either reading in or writing from a passed in
> + * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
> + * read to or write from pointer.
> + *
> + * Returns:
> + * 0 if successful, negative error code on failure.
> + */
> +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> + unsigned long offset, void *buf, int len,
> + int write)
> +{
> + struct xe_tile *tile = m->tile;
> + struct xe_device *xe = tile_to_xe(tile);
> + struct xe_res_cursor cursor;
> + struct dma_fence *fence = NULL;
> + dma_addr_t *dma_addr;
> + unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
> + int bytes_left = len, current_page = 0;
> + void *orig_buf = buf;
> +
> + xe_bo_assert_held(bo);
> +
> + /* Use bounce buffer for small access and unaligned access */
> + if (len & XE_CACHELINE_MASK ||
> + ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
> + int buf_offset = 0;
> +
> + /*
> + * Less than ideal for large unaligned access but this should be
> + * fairly rare, can fixup if this becomes common.
> + */
> + do {
> + u8 bounce[XE_CACHELINE_BYTES];
> + void *ptr = (void *)bounce;
> + int err;
> + int copy_bytes = min_t(int, bytes_left,
> + XE_CACHELINE_BYTES -
> + (offset & XE_CACHELINE_MASK));
> + int ptr_offset = offset & XE_CACHELINE_MASK;
> +
> + err = xe_migrate_access_memory(m, bo,
> + offset &
> + ~XE_CACHELINE_MASK,
> + (void *)ptr,
> + sizeof(bounce), 0);
> + if (err)
> + return err;
> +
> + if (!write) {
> + memcpy(buf + buf_offset, ptr + ptr_offset,
> + copy_bytes);
> + goto next;
> + }
Non-blocking:
This should debatably be an if-else block, rather than relying on a "goto next"
here, as "goto next" is not used elsewhere in the function.
"""
if (write) {
memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
err = xe_migrate_access_memory(m, bo,
offset & ~XE_CACHELINE_MASK,
(void *)ptr,
sizeof(bounce), 0);
} else {
memcpy(buf + buf_offset, ptr + ptr_offset,
copy_bytes);
}
if (err)
return err;
"""
> +
> + memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
> + err = xe_migrate_access_memory(m, bo,
> + offset & ~XE_CACHELINE_MASK,
> + (void *)ptr,
> + sizeof(bounce), 0);
> + if (err)
> + return err;
> +
> +next:
> + bytes_left -= copy_bytes;
> + buf_offset += copy_bytes;
> + offset += copy_bytes;
> + } while (bytes_left);
> +
> + return 0;
> + }
> +
> + dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
> + if (IS_ERR(dma_addr))
> + return PTR_ERR(dma_addr);
> +
> + xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
> +
> + do {
> + struct dma_fence *__fence;
> + u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
> + cursor.start;
> + int current_bytes;
> +
> + if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
> + current_bytes = min_t(int, bytes_left,
> + MAX_PREEMPTDISABLE_TRANSFER);
> + else
> + current_bytes = min_t(int, bytes_left, cursor.size);
> +
> + if (fence)
> + dma_fence_put(fence);
> +
> + __fence = xe_migrate_vram(m, current_bytes,
> + (unsigned long)buf & ~PAGE_MASK,
> + dma_addr + current_page,
> + vram_addr, write ?
> + XE_MIGRATE_COPY_TO_VRAM :
> + XE_MIGRATE_COPY_TO_SRAM);
> + if (IS_ERR(__fence)) {
> + if (fence)
> + dma_fence_wait(fence, false);
> + fence = __fence;
> + goto out_err;
> + }
> + fence = __fence;
Non-blocking:
It would be nice if we didn't have two paths assigning fence the __fence value,
but any attempt I do to try and change that ends in having to compute the IS_ERR
value for __fence twice. E.G.:
"""
if (IS_ERR(__fence) && fence)
dma_fence_wait(fence, false);
fence = __fence;
if (IS_ERR(fence))
goto out_err;
"""
So, either way works.
> +
> + buf += current_bytes;
> + offset += current_bytes;
> + current_page = (int)(buf - orig_buf) / PAGE_SIZE;
> + bytes_left -= current_bytes;
> + if (bytes_left)
> + xe_res_next(&cursor, current_bytes);
> + } while (bytes_left);
> +
> + dma_fence_wait(fence, false);
> + dma_fence_put(fence);
> + xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
> +
> + return 0;
> +
> +out_err:
> + xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
> + return PTR_ERR(fence);
Non-blocking:
The branching execution might be able to be shortened. Perhaps:
"""
dma_fence_wait(fence, false);
dma_fence_put(fence);
out_err:
xe_migtate_dma_unmap(xe, dma_addr, len + page_offset, write);
return IS_ERR(fence) ? PTR_ERR(fence) : 0;
}
"""
-Jonathan Cavitt
> +}
> +
> #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
> #include "tests/xe_migrate.c"
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> index 6ff9a963425c..fb9839c1bae0 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.h
> +++ b/drivers/gpu/drm/xe/xe_migrate.h
> @@ -112,6 +112,10 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
> struct ttm_resource *dst,
> bool copy_only_ccs);
>
> +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> + unsigned long offset, void *buf, int len,
> + int write);
> +
> #define XE_MIGRATE_CLEAR_FLAG_BO_DATA BIT(0)
> #define XE_MIGRATE_CLEAR_FLAG_CCS_DATA BIT(1)
> #define XE_MIGRATE_CLEAR_FLAG_FULL (XE_MIGRATE_CLEAR_FLAG_BO_DATA | \
> --
> 2.34.1
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH v2 3/4] drm/print: Add drm_printer_is_full
2025-04-03 20:27 ` [PATCH v2 3/4] drm/print: Add drm_printer_is_full Matthew Brost
@ 2025-04-03 21:28 ` Cavitt, Jonathan
0 siblings, 0 replies; 20+ messages in thread
From: Cavitt, Jonathan @ 2025-04-03 21:28 UTC (permalink / raw)
To: Brost, Matthew, intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org, Cavitt, Jonathan
-----Original Message-----
From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Thursday, April 3, 2025 1:27 PM
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Subject: [PATCH v2 3/4] drm/print: Add drm_printer_is_full
>
> Add drm_printer_is_full which indicates if a drm printer's output is
> full. Useful to short circuit coredump printing once printer's output is
> full.
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
LGTM, though I will admit that "remain" is a bit of a strange name for the
iterator variable. I'd almost imagine "remain" to refer to the "remaining
output the iterator needs to print", rather than the "remaining space in
the output buffer of the iterator".
But that's just an aside. Nothing worth blocking on.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
-Jonathan Cavitt
> ---
> include/drm/drm_print.h | 17 +++++++++++++++++
> 1 file changed, 17 insertions(+)
>
> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
> index f31eba1c7cab..db7517ee1b19 100644
> --- a/include/drm/drm_print.h
> +++ b/include/drm/drm_print.h
> @@ -242,6 +242,23 @@ struct drm_print_iterator {
> ssize_t offset;
> };
>
> +/**
> + * drm_printer_is_full() - DRM printer output is full
> + * @p: DRM printer
> + *
> + * DRM printer output is full, useful to short circuit coredump printing once
> + * printer is full.
> + *
> + * RETURNS:
> + * True if DRM printer output buffer is full, False otherwise
> + */
> +static inline bool drm_printer_is_full(struct drm_printer *p)
> +{
> + struct drm_print_iterator *iterator = p->arg;
> +
> + return !iterator->remain;
> +}
> +
> /**
> * drm_coredump_printer - construct a &drm_printer that can output to a buffer
> * from the read function for devcoredump
> --
> 2.34.1
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full
2025-04-03 20:27 ` [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full Matthew Brost
@ 2025-04-03 21:28 ` Cavitt, Jonathan
0 siblings, 0 replies; 20+ messages in thread
From: Cavitt, Jonathan @ 2025-04-03 21:28 UTC (permalink / raw)
To: Brost, Matthew, intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org, Cavitt, Jonathan
-----Original Message-----
From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Thursday, April 3, 2025 1:27 PM
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Subject: [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full
>
> Abort printing coredump in VM printer output if full. Helps speedup
> large coredumps which need to walked multiple times in
> xe_devcoredump_read.
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
LGTM.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
-Jonathan Cavitt
> ---
> drivers/gpu/drm/xe/xe_vm.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 864266e38aa7..164824617a2e 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -3847,6 +3847,9 @@ void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
> }
>
> drm_puts(p, "\n");
> +
> + if (drm_printer_is_full(p))
> + return;
> }
> }
>
> --
> 2.34.1
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
@ 2025-04-03 21:39 ` John Harrison
2025-04-03 22:32 ` Matthew Brost
1 sibling, 1 reply; 20+ messages in thread
From: John Harrison @ 2025-04-03 21:39 UTC (permalink / raw)
To: Matthew Brost, intel-xe; +Cc: dri-devel
On 4/3/2025 1:27 PM, Matthew Brost wrote:
> Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
> of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
> callback as needed.
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
> drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++-----
> drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 +
> drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +-
> 3 files changed, 50 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 81b9d9bb3f57..a9e618abf8ac 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
> return &q->gt->uc.guc;
> }
>
> -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
> + ssize_t start,
> struct xe_devcoredump *coredump)
> {
> struct xe_device *xe;
> @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> ss = &coredump->snapshot;
>
> iter.data = buffer;
> - iter.start = 0;
> + iter.start = start;
> iter.remain = count;
>
> p = drm_coredump_printer(&iter);
> @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
> ss->vm = NULL;
> }
>
> +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G)
> +
> static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> size_t count, void *data, size_t datalen)
> {
> @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> /* Ensure delayed work is captured before continuing */
> flush_work(&ss->work);
>
> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> + xe_pm_runtime_get(gt_to_xe(ss->gt));
> +
> mutex_lock(&coredump->lock);
>
> if (!ss->read.buffer) {
> @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> return 0;
> }
>
> + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
> + offset < ss->read.chunk_position) {
> + ss->read.chunk_position =
> + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
> +
> + __xe_devcoredump_read(ss->read.buffer,
> + XE_DEVCOREDUMP_CHUNK_MAX,
> + ss->read.chunk_position, coredump);
> + }
> +
> byte_copied = count < ss->read.size - offset ? count :
> ss->read.size - offset;
> - memcpy(buffer, ss->read.buffer + offset, byte_copied);
> + memcpy(buffer, ss->read.buffer +
> + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
>
> mutex_unlock(&coredump->lock);
>
> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> + xe_pm_runtime_put(gt_to_xe(ss->gt));
> +
> return byte_copied;
> }
>
> @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
> xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
> xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
>
> - xe_pm_runtime_put(xe);
> + ss->read.chunk_position = 0;
>
> /* Calculate devcoredump size */
> - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
> -
> - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> - if (!ss->read.buffer)
> - return;
> + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
> +
> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
> + ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
> + GFP_USER);
> + if (!ss->read.buffer)
> + goto put_pm;
> +
> + __xe_devcoredump_read(ss->read.buffer,
> + XE_DEVCOREDUMP_CHUNK_MAX,
> + 0, coredump);
> + } else {
> + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> + if (!ss->read.buffer)
> + goto put_pm;
> +
> + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
> + coredump);
> + xe_devcoredump_snapshot_free(ss);
> + }
>
> - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
> - xe_devcoredump_snapshot_free(ss);
> +put_pm:
> + xe_pm_runtime_put(xe);
> }
>
> static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
> if (offset & 3)
> drm_printf(p, "Offset not word aligned: %zu", offset);
>
> - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
> + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
Is this related? Or should it be a separate patch?
> if (!line_buff) {
> drm_printf(p, "Failed to allocate line buffer\n");
> return;
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 1a1d16a96b2d..a174385a6d83 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
> struct {
> /** @read.size: size of devcoredump in human readable format */
> ssize_t size;
> + /** @read.chunk_position: position of devcoredump chunk */
> + ssize_t chunk_position;
> /** @read.buffer: buffer of devcoredump in human readable format */
> char *buffer;
> } read;
> diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> index af2c817d552c..21403a250834 100644
> --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
> if (num_dw == 0)
> return -EINVAL;
>
> - hwconfig = kzalloc(size, GFP_KERNEL);
> + hwconfig = kzalloc(size, GFP_ATOMIC);
Likewise this?
John.
> if (!hwconfig)
> return -ENOMEM;
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
2025-04-03 21:39 ` John Harrison
@ 2025-04-03 22:32 ` Matthew Brost
2025-04-03 22:41 ` John Harrison
0 siblings, 1 reply; 20+ messages in thread
From: Matthew Brost @ 2025-04-03 22:32 UTC (permalink / raw)
To: John Harrison; +Cc: intel-xe, dri-devel
On Thu, Apr 03, 2025 at 02:39:16PM -0700, John Harrison wrote:
> On 4/3/2025 1:27 PM, Matthew Brost wrote:
> > Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
> > of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
> > callback as needed.
> >
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++-----
> > drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 +
> > drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +-
> > 3 files changed, 50 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> > index 81b9d9bb3f57..a9e618abf8ac 100644
> > --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> > @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
> > return &q->gt->uc.guc;
> > }
> > -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> > +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
> > + ssize_t start,
> > struct xe_devcoredump *coredump)
> > {
> > struct xe_device *xe;
> > @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> > ss = &coredump->snapshot;
> > iter.data = buffer;
> > - iter.start = 0;
> > + iter.start = start;
> > iter.remain = count;
> > p = drm_coredump_printer(&iter);
> > @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
> > ss->vm = NULL;
> > }
> > +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G)
> > +
> > static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > size_t count, void *data, size_t datalen)
> > {
> > @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > /* Ensure delayed work is captured before continuing */
> > flush_work(&ss->work);
> > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> > + xe_pm_runtime_get(gt_to_xe(ss->gt));
> > +
> > mutex_lock(&coredump->lock);
> > if (!ss->read.buffer) {
> > @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > return 0;
> > }
> > + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
> > + offset < ss->read.chunk_position) {
> > + ss->read.chunk_position =
> > + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
> > +
> > + __xe_devcoredump_read(ss->read.buffer,
> > + XE_DEVCOREDUMP_CHUNK_MAX,
> > + ss->read.chunk_position, coredump);
> > + }
> > +
> > byte_copied = count < ss->read.size - offset ? count :
> > ss->read.size - offset;
> > - memcpy(buffer, ss->read.buffer + offset, byte_copied);
> > + memcpy(buffer, ss->read.buffer +
> > + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
> > mutex_unlock(&coredump->lock);
> > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> > + xe_pm_runtime_put(gt_to_xe(ss->gt));
> > +
> > return byte_copied;
> > }
> > @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
> > xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
> > xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
> > - xe_pm_runtime_put(xe);
> > + ss->read.chunk_position = 0;
> > /* Calculate devcoredump size */
> > - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
> > -
> > - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> > - if (!ss->read.buffer)
> > - return;
> > + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
> > +
> > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
> > + ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
> > + GFP_USER);
> > + if (!ss->read.buffer)
> > + goto put_pm;
> > +
> > + __xe_devcoredump_read(ss->read.buffer,
> > + XE_DEVCOREDUMP_CHUNK_MAX,
> > + 0, coredump);
> > + } else {
> > + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> > + if (!ss->read.buffer)
> > + goto put_pm;
> > +
> > + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
> > + coredump);
> > + xe_devcoredump_snapshot_free(ss);
> > + }
> > - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
> > - xe_devcoredump_snapshot_free(ss);
> > +put_pm:
> > + xe_pm_runtime_put(xe);
> > }
> > static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> > @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
> > if (offset & 3)
> > drm_printf(p, "Offset not word aligned: %zu", offset);
> > - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
> > + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
> Is this related? Or should it be a separate patch?
>
It is related. Now that __xe_devcoredump_read is called under
'coredump->lock' we are in the path of reclaim, __xe_devcoredump_read
calls xe_print_blob_ascii85.
Both cases the allocation is relatively small, so would be fairly
unlikely to fail.
I could make this a seperate prep patch if you think that would be
better.
> > if (!line_buff) {
> > drm_printf(p, "Failed to allocate line buffer\n");
> > return;
> > diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > index 1a1d16a96b2d..a174385a6d83 100644
> > --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
> > struct {
> > /** @read.size: size of devcoredump in human readable format */
> > ssize_t size;
> > + /** @read.chunk_position: position of devcoredump chunk */
> > + ssize_t chunk_position;
> > /** @read.buffer: buffer of devcoredump in human readable format */
> > char *buffer;
> > } read;
> > diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > index af2c817d552c..21403a250834 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
> > if (num_dw == 0)
> > return -EINVAL;
> > - hwconfig = kzalloc(size, GFP_KERNEL);
> > + hwconfig = kzalloc(size, GFP_ATOMIC);
> Likewise this?
>
Same as above.
Matt
> John.
>
> > if (!hwconfig)
> > return -ENOMEM;
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
2025-04-03 22:32 ` Matthew Brost
@ 2025-04-03 22:41 ` John Harrison
2025-04-04 2:33 ` Matthew Brost
0 siblings, 1 reply; 20+ messages in thread
From: John Harrison @ 2025-04-03 22:41 UTC (permalink / raw)
To: Matthew Brost; +Cc: intel-xe, dri-devel
On 4/3/2025 3:32 PM, Matthew Brost wrote:
> On Thu, Apr 03, 2025 at 02:39:16PM -0700, John Harrison wrote:
>> On 4/3/2025 1:27 PM, Matthew Brost wrote:
>>> Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
>>> of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
>>> callback as needed.
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>> drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++-----
>>> drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 +
>>> drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +-
>>> 3 files changed, 50 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
>>> index 81b9d9bb3f57..a9e618abf8ac 100644
>>> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
>>> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
>>> @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
>>> return &q->gt->uc.guc;
>>> }
>>> -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>>> +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
>>> + ssize_t start,
>>> struct xe_devcoredump *coredump)
>>> {
>>> struct xe_device *xe;
>>> @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>>> ss = &coredump->snapshot;
>>> iter.data = buffer;
>>> - iter.start = 0;
>>> + iter.start = start;
>>> iter.remain = count;
>>> p = drm_coredump_printer(&iter);
>>> @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
>>> ss->vm = NULL;
>>> }
>>> +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G)
>>> +
>>> static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>> size_t count, void *data, size_t datalen)
>>> {
>>> @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>> /* Ensure delayed work is captured before continuing */
>>> flush_work(&ss->work);
>>> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
>>> + xe_pm_runtime_get(gt_to_xe(ss->gt));
>>> +
>>> mutex_lock(&coredump->lock);
>>> if (!ss->read.buffer) {
>>> @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>> return 0;
>>> }
>>> + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
>>> + offset < ss->read.chunk_position) {
>>> + ss->read.chunk_position =
>>> + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
>>> +
>>> + __xe_devcoredump_read(ss->read.buffer,
>>> + XE_DEVCOREDUMP_CHUNK_MAX,
>>> + ss->read.chunk_position, coredump);
>>> + }
>>> +
>>> byte_copied = count < ss->read.size - offset ? count :
>>> ss->read.size - offset;
>>> - memcpy(buffer, ss->read.buffer + offset, byte_copied);
>>> + memcpy(buffer, ss->read.buffer +
>>> + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
>>> mutex_unlock(&coredump->lock);
>>> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
>>> + xe_pm_runtime_put(gt_to_xe(ss->gt));
>>> +
>>> return byte_copied;
>>> }
>>> @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
>>> xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
>>> xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
>>> - xe_pm_runtime_put(xe);
>>> + ss->read.chunk_position = 0;
>>> /* Calculate devcoredump size */
>>> - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
>>> -
>>> - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
>>> - if (!ss->read.buffer)
>>> - return;
>>> + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
>>> +
>>> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
>>> + ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
>>> + GFP_USER);
>>> + if (!ss->read.buffer)
>>> + goto put_pm;
>>> +
>>> + __xe_devcoredump_read(ss->read.buffer,
>>> + XE_DEVCOREDUMP_CHUNK_MAX,
>>> + 0, coredump);
>>> + } else {
>>> + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
>>> + if (!ss->read.buffer)
>>> + goto put_pm;
>>> +
>>> + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
>>> + coredump);
>>> + xe_devcoredump_snapshot_free(ss);
>>> + }
>>> - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
>>> - xe_devcoredump_snapshot_free(ss);
>>> +put_pm:
>>> + xe_pm_runtime_put(xe);
>>> }
>>> static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>>> @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
>>> if (offset & 3)
>>> drm_printf(p, "Offset not word aligned: %zu", offset);
>>> - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
>>> + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
>> Is this related? Or should it be a separate patch?
>>
> It is related. Now that __xe_devcoredump_read is called under
> 'coredump->lock' we are in the path of reclaim, __xe_devcoredump_read
> calls xe_print_blob_ascii85.
>
> Both cases the allocation is relatively small, so would be fairly
> unlikely to fail.
>
> I could make this a seperate prep patch if you think that would be
> better.
Not worth splitting if this is the requirement. But maybe just add a
comment to the commit message about why it is necessary.
>
>>> if (!line_buff) {
>>> drm_printf(p, "Failed to allocate line buffer\n");
>>> return;
>>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
>>> index 1a1d16a96b2d..a174385a6d83 100644
>>> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
>>> @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
>>> struct {
>>> /** @read.size: size of devcoredump in human readable format */
>>> ssize_t size;
>>> + /** @read.chunk_position: position of devcoredump chunk */
>>> + ssize_t chunk_position;
>>> /** @read.buffer: buffer of devcoredump in human readable format */
>>> char *buffer;
>>> } read;
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
>>> index af2c817d552c..21403a250834 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
>>> @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
>>> if (num_dw == 0)
>>> return -EINVAL;
>>> - hwconfig = kzalloc(size, GFP_KERNEL);
>>> + hwconfig = kzalloc(size, GFP_ATOMIC);
>> Likewise this?
Hmm, hadn't realised we were doing an alloc/copy/free for every config
table access. Is that just to avoid doing iosys type reads because the
table is in LMEM? Seems a lot of overhead for each individual u32 read!
But also why are we doing table reads during core dump -> text printing?
AFAICT, the only reads are in the steering code for doing EU register
reads. Don't all the reg reads happen up front and then we stop
accessing the hardware while doing the chunked dump? If we are
re-reading hardware state when doing the text conversion we can end up
with inconsistent dumps as the state changes from one chunk to the next?
John.
>>
> Same as above.
>
> Matt
>
>> John.
>>
>>> if (!hwconfig)
>>> return -ENOMEM;
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
@ 2025-04-04 1:03 ` kernel test robot
2025-04-04 3:36 ` kernel test robot
2025-04-04 3:46 ` kernel test robot
3 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2025-04-04 1:03 UTC (permalink / raw)
To: Matthew Brost; +Cc: oe-kbuild-all
Hi Matthew,
kernel test robot noticed the following build warnings:
[auto build test WARNING on drm-xe/drm-xe-next]
[also build test WARNING on next-20250403]
[cannot apply to drm-exynos/exynos-drm-next linus/master drm/drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-misc/drm-misc-next drm-tip/drm-tip v6.14]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Matthew-Brost/drm-xe-Add-devcoredump-chunking/20250404-042700
base: https://gitlab.freedesktop.org/drm/xe/kernel.git drm-xe-next
patch link: https://lore.kernel.org/r/20250403202705.18488-3-matthew.brost%40intel.com
patch subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
config: alpha-allyesconfig (https://download.01.org/0day-ci/archive/20250404/202504040820.oEc4vaWF-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250404/202504040820.oEc4vaWF-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202504040820.oEc4vaWF-lkp@intel.com/
All warnings (new ones prefixed by >>):
drivers/gpu/drm/xe/xe_migrate.c: In function 'xe_migrate_access_memory':
drivers/gpu/drm/xe/xe_migrate.c:1819:19: error: 'XE_CACHELINE_MASK' undeclared (first use in this function)
1819 | if (len & XE_CACHELINE_MASK ||
| ^~~~~~~~~~~~~~~~~
drivers/gpu/drm/xe/xe_migrate.c:1819:19: note: each undeclared identifier is reported only once for each function it appears in
drivers/gpu/drm/xe/xe_migrate.c:1828:35: error: 'XE_CACHELINE_BYTES' undeclared (first use in this function); did you mean 'L1_CACHE_BYTES'?
1828 | u8 bounce[XE_CACHELINE_BYTES];
| ^~~~~~~~~~~~~~~~~~
| L1_CACHE_BYTES
>> drivers/gpu/drm/xe/xe_migrate.c:1828:28: warning: unused variable 'bounce' [-Wunused-variable]
1828 | u8 bounce[XE_CACHELINE_BYTES];
| ^~~~~~
drivers/gpu/drm/xe/xe_migrate.c:1888:27: error: implicit declaration of function 'xe_migrate_vram'; did you mean 'xe_migrate_to_vram'? [-Wimplicit-function-declaration]
1888 | __fence = xe_migrate_vram(m, current_bytes,
| ^~~~~~~~~~~~~~~
| xe_migrate_to_vram
drivers/gpu/drm/xe/xe_migrate.c:1892:43: error: 'XE_MIGRATE_COPY_TO_VRAM' undeclared (first use in this function)
1892 | XE_MIGRATE_COPY_TO_VRAM :
| ^~~~~~~~~~~~~~~~~~~~~~~
drivers/gpu/drm/xe/xe_migrate.c:1893:43: error: 'XE_MIGRATE_COPY_TO_SRAM' undeclared (first use in this function)
1893 | XE_MIGRATE_COPY_TO_SRAM);
| ^~~~~~~~~~~~~~~~~~~~~~~
vim +/bounce +1828 drivers/gpu/drm/xe/xe_migrate.c
1785
1786 /**
1787 * xe_migrate_access_memory - Access memory of a BO via GPU
1788 *
1789 * @m: The migration context.
1790 * @bo: buffer object
1791 * @offset: access offset into buffer object
1792 * @buf: pointer to caller memory to read into or write from
1793 * @len: length of access
1794 * @write: write access
1795 *
1796 * Access memory of a BO via GPU either reading in or writing from a passed in
1797 * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
1798 * read to or write from pointer.
1799 *
1800 * Returns:
1801 * 0 if successful, negative error code on failure.
1802 */
1803 int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
1804 unsigned long offset, void *buf, int len,
1805 int write)
1806 {
1807 struct xe_tile *tile = m->tile;
1808 struct xe_device *xe = tile_to_xe(tile);
1809 struct xe_res_cursor cursor;
1810 struct dma_fence *fence = NULL;
1811 dma_addr_t *dma_addr;
1812 unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
1813 int bytes_left = len, current_page = 0;
1814 void *orig_buf = buf;
1815
1816 xe_bo_assert_held(bo);
1817
1818 /* Use bounce buffer for small access and unaligned access */
> 1819 if (len & XE_CACHELINE_MASK ||
1820 ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
1821 int buf_offset = 0;
1822
1823 /*
1824 * Less than ideal for large unaligned access but this should be
1825 * fairly rare, can fixup if this becomes common.
1826 */
1827 do {
> 1828 u8 bounce[XE_CACHELINE_BYTES];
1829 void *ptr = (void *)bounce;
1830 int err;
1831 int copy_bytes = min_t(int, bytes_left,
1832 XE_CACHELINE_BYTES -
1833 (offset & XE_CACHELINE_MASK));
1834 int ptr_offset = offset & XE_CACHELINE_MASK;
1835
1836 err = xe_migrate_access_memory(m, bo,
1837 offset &
1838 ~XE_CACHELINE_MASK,
1839 (void *)ptr,
1840 sizeof(bounce), 0);
1841 if (err)
1842 return err;
1843
1844 if (!write) {
1845 memcpy(buf + buf_offset, ptr + ptr_offset,
1846 copy_bytes);
1847 goto next;
1848 }
1849
1850 memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
1851 err = xe_migrate_access_memory(m, bo,
1852 offset & ~XE_CACHELINE_MASK,
1853 (void *)ptr,
1854 sizeof(bounce), 0);
1855 if (err)
1856 return err;
1857
1858 next:
1859 bytes_left -= copy_bytes;
1860 buf_offset += copy_bytes;
1861 offset += copy_bytes;
1862 } while (bytes_left);
1863
1864 return 0;
1865 }
1866
1867 dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
1868 if (IS_ERR(dma_addr))
1869 return PTR_ERR(dma_addr);
1870
1871 xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
1872
1873 do {
1874 struct dma_fence *__fence;
1875 u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
1876 cursor.start;
1877 int current_bytes;
1878
1879 if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
1880 current_bytes = min_t(int, bytes_left,
1881 MAX_PREEMPTDISABLE_TRANSFER);
1882 else
1883 current_bytes = min_t(int, bytes_left, cursor.size);
1884
1885 if (fence)
1886 dma_fence_put(fence);
1887
1888 __fence = xe_migrate_vram(m, current_bytes,
1889 (unsigned long)buf & ~PAGE_MASK,
1890 dma_addr + current_page,
1891 vram_addr, write ?
1892 XE_MIGRATE_COPY_TO_VRAM :
1893 XE_MIGRATE_COPY_TO_SRAM);
1894 if (IS_ERR(__fence)) {
1895 if (fence)
1896 dma_fence_wait(fence, false);
1897 fence = __fence;
1898 goto out_err;
1899 }
1900 fence = __fence;
1901
1902 buf += current_bytes;
1903 offset += current_bytes;
1904 current_page = (int)(buf - orig_buf) / PAGE_SIZE;
1905 bytes_left -= current_bytes;
1906 if (bytes_left)
1907 xe_res_next(&cursor, current_bytes);
1908 } while (bytes_left);
1909
1910 dma_fence_wait(fence, false);
1911 dma_fence_put(fence);
1912 xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
1913
1914 return 0;
1915
1916 out_err:
1917 xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
1918 return PTR_ERR(fence);
1919 }
1920
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 1/4] drm/xe: Add devcoredump chunking
2025-04-03 22:41 ` John Harrison
@ 2025-04-04 2:33 ` Matthew Brost
0 siblings, 0 replies; 20+ messages in thread
From: Matthew Brost @ 2025-04-04 2:33 UTC (permalink / raw)
To: John Harrison; +Cc: intel-xe, dri-devel
On Thu, Apr 03, 2025 at 03:41:16PM -0700, John Harrison wrote:
> On 4/3/2025 3:32 PM, Matthew Brost wrote:
> > On Thu, Apr 03, 2025 at 02:39:16PM -0700, John Harrison wrote:
> > > On 4/3/2025 1:27 PM, Matthew Brost wrote:
> > > > Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit
> > > > of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read
> > > > callback as needed.
> > > >
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > > drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++-----
> > > > drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 +
> > > > drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +-
> > > > 3 files changed, 50 insertions(+), 13 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> > > > index 81b9d9bb3f57..a9e618abf8ac 100644
> > > > --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> > > > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> > > > @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
> > > > return &q->gt->uc.guc;
> > > > }
> > > > -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> > > > +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
> > > > + ssize_t start,
> > > > struct xe_devcoredump *coredump)
> > > > {
> > > > struct xe_device *xe;
> > > > @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> > > > ss = &coredump->snapshot;
> > > > iter.data = buffer;
> > > > - iter.start = 0;
> > > > + iter.start = start;
> > > > iter.remain = count;
> > > > p = drm_coredump_printer(&iter);
> > > > @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
> > > > ss->vm = NULL;
> > > > }
> > > > +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G)
> > > > +
> > > > static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > > > size_t count, void *data, size_t datalen)
> > > > {
> > > > @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > > > /* Ensure delayed work is captured before continuing */
> > > > flush_work(&ss->work);
> > > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> > > > + xe_pm_runtime_get(gt_to_xe(ss->gt));
> > > > +
> > > > mutex_lock(&coredump->lock);
> > > > if (!ss->read.buffer) {
> > > > @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > > > return 0;
> > > > }
> > > > + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
> > > > + offset < ss->read.chunk_position) {
> > > > + ss->read.chunk_position =
> > > > + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
> > > > +
> > > > + __xe_devcoredump_read(ss->read.buffer,
> > > > + XE_DEVCOREDUMP_CHUNK_MAX,
> > > > + ss->read.chunk_position, coredump);
> > > > + }
> > > > +
> > > > byte_copied = count < ss->read.size - offset ? count :
> > > > ss->read.size - offset;
> > > > - memcpy(buffer, ss->read.buffer + offset, byte_copied);
> > > > + memcpy(buffer, ss->read.buffer +
> > > > + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
> > > > mutex_unlock(&coredump->lock);
> > > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> > > > + xe_pm_runtime_put(gt_to_xe(ss->gt));
> > > > +
> > > > return byte_copied;
> > > > }
> > > > @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
> > > > xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
> > > > xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
> > > > - xe_pm_runtime_put(xe);
> > > > + ss->read.chunk_position = 0;
> > > > /* Calculate devcoredump size */
> > > > - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
> > > > -
> > > > - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> > > > - if (!ss->read.buffer)
> > > > - return;
> > > > + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
> > > > +
> > > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
> > > > + ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
> > > > + GFP_USER);
> > > > + if (!ss->read.buffer)
> > > > + goto put_pm;
> > > > +
> > > > + __xe_devcoredump_read(ss->read.buffer,
> > > > + XE_DEVCOREDUMP_CHUNK_MAX,
> > > > + 0, coredump);
> > > > + } else {
> > > > + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> > > > + if (!ss->read.buffer)
> > > > + goto put_pm;
> > > > +
> > > > + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
> > > > + coredump);
> > > > + xe_devcoredump_snapshot_free(ss);
> > > > + }
> > > > - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
> > > > - xe_devcoredump_snapshot_free(ss);
> > > > +put_pm:
> > > > + xe_pm_runtime_put(xe);
> > > > }
> > > > static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> > > > @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
> > > > if (offset & 3)
> > > > drm_printf(p, "Offset not word aligned: %zu", offset);
> > > > - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
> > > > + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
> > > Is this related? Or should it be a separate patch?
> > >
> > It is related. Now that __xe_devcoredump_read is called under
> > 'coredump->lock' we are in the path of reclaim, __xe_devcoredump_read
> > calls xe_print_blob_ascii85.
> >
> > Both cases the allocation is relatively small, so would be fairly
> > unlikely to fail.
> >
> > I could make this a seperate prep patch if you think that would be
> > better.
> Not worth splitting if this is the requirement. But maybe just add a comment
> to the commit message about why it is necessary.
>
Sure.
> >
> > > > if (!line_buff) {
> > > > drm_printf(p, "Failed to allocate line buffer\n");
> > > > return;
> > > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > > > index 1a1d16a96b2d..a174385a6d83 100644
> > > > --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > > > @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
> > > > struct {
> > > > /** @read.size: size of devcoredump in human readable format */
> > > > ssize_t size;
> > > > + /** @read.chunk_position: position of devcoredump chunk */
> > > > + ssize_t chunk_position;
> > > > /** @read.buffer: buffer of devcoredump in human readable format */
> > > > char *buffer;
> > > > } read;
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > > > index af2c817d552c..21403a250834 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > > > @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
> > > > if (num_dw == 0)
> > > > return -EINVAL;
> > > > - hwconfig = kzalloc(size, GFP_KERNEL);
> > > > + hwconfig = kzalloc(size, GFP_ATOMIC);
> > > Likewise this?
> Hmm, hadn't realised we were doing an alloc/copy/free for every config table
> access. Is that just to avoid doing iosys type reads because the table is in
> LMEM? Seems a lot of overhead for each individual u32 read!
>
> But also why are we doing table reads during core dump -> text printing?
> AFAICT, the only reads are in the steering code for doing EU register reads.
> Don't all the reg reads happen up front and then we stop accessing the
> hardware while doing the chunked dump? If we are re-reading hardware state
> when doing the text conversion we can end up with inconsistent dumps as the
> state changes from one chunk to the next?
>
Honestly not familiar with this code at all, lockdep puked and just fixed it. :)
On its surface, it does like an unnecessary allocation.
Matt
> John.
>
>
> > >
> > Same as above.
> >
> > Matt
> >
> > > John.
> > >
> > > > if (!hwconfig)
> > > > return -ENOMEM;
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* ✓ CI.Patch_applied: success for Large devcoredump file support (rev2)
2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
` (3 preceding siblings ...)
2025-04-03 20:27 ` [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full Matthew Brost
@ 2025-04-04 3:31 ` Patchwork
2025-04-04 3:32 ` ✓ CI.checkpatch: " Patchwork
2025-04-04 3:32 ` ✗ CI.KUnit: failure " Patchwork
6 siblings, 0 replies; 20+ messages in thread
From: Patchwork @ 2025-04-04 3:31 UTC (permalink / raw)
To: Matthew Brost; +Cc: intel-xe
== Series Details ==
Series: Large devcoredump file support (rev2)
URL : https://patchwork.freedesktop.org/series/147085/
State : success
== Summary ==
=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: bc18da45d48d drm-tip: 2025y-04m-03d-16h-39m-16s UTC integration manifest
=== git am output follows ===
Applying: drm/xe: Add devcoredump chunking
Applying: drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
Applying: drm/print: Add drm_printer_is_full
Applying: drm/xe: Abort printing coredump in VM printer output if full
^ permalink raw reply [flat|nested] 20+ messages in thread
* ✓ CI.checkpatch: success for Large devcoredump file support (rev2)
2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
` (4 preceding siblings ...)
2025-04-04 3:31 ` ✓ CI.Patch_applied: success for Large devcoredump file support (rev2) Patchwork
@ 2025-04-04 3:32 ` Patchwork
2025-04-04 3:32 ` ✗ CI.KUnit: failure " Patchwork
6 siblings, 0 replies; 20+ messages in thread
From: Patchwork @ 2025-04-04 3:32 UTC (permalink / raw)
To: Matthew Brost; +Cc: intel-xe
== Series Details ==
Series: Large devcoredump file support (rev2)
URL : https://patchwork.freedesktop.org/series/147085/
State : success
== Summary ==
+ KERNEL=/kernel
+ git clone https://gitlab.freedesktop.org/drm/maintainer-tools mt
Cloning into 'mt'...
warning: redirecting to https://gitlab.freedesktop.org/drm/maintainer-tools.git/
+ git -C mt rev-list -n1 origin/master
99e5a866b5e13f134e606a3e29d9508d97826fb3
+ cd /kernel
+ git config --global --add safe.directory /kernel
+ git log -n1
commit 3be0df16c3f601049941a67eda14fc2517175ec5
Author: Matthew Brost <matthew.brost@intel.com>
Date: Thu Apr 3 13:27:05 2025 -0700
drm/xe: Abort printing coredump in VM printer output if full
Abort printing coredump in VM printer output if full. Helps speedup
large coredumps which need to walked multiple times in
xe_devcoredump_read.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
+ /mt/dim checkpatch bc18da45d48d337b92a7ff9546ba61da32b3b586 drm-intel
8bc892e4cfa7 drm/xe: Add devcoredump chunking
12ddfe38c995 drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
4bbe5e66b06a drm/print: Add drm_printer_is_full
3be0df16c3f6 drm/xe: Abort printing coredump in VM printer output if full
^ permalink raw reply [flat|nested] 20+ messages in thread
* ✗ CI.KUnit: failure for Large devcoredump file support (rev2)
2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
` (5 preceding siblings ...)
2025-04-04 3:32 ` ✓ CI.checkpatch: " Patchwork
@ 2025-04-04 3:32 ` Patchwork
6 siblings, 0 replies; 20+ messages in thread
From: Patchwork @ 2025-04-04 3:32 UTC (permalink / raw)
To: Matthew Brost; +Cc: intel-xe
== Series Details ==
Series: Large devcoredump file support (rev2)
URL : https://patchwork.freedesktop.org/series/147085/
State : failure
== Summary ==
+ trap cleanup EXIT
+ /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
[03:32:02] Configuring KUnit Kernel ...
Generating .config ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
[03:32:06] Building KUnit Kernel ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
Building with:
$ make all compile_commands.json ARCH=um O=.kunit --jobs=48
ERROR:root:../lib/iomap.c:156:5: warning: no previous prototype for ‘ioread64_lo_hi’ [-Wmissing-prototypes]
156 | u64 ioread64_lo_hi(const void __iomem *addr)
| ^~~~~~~~~~~~~~
../lib/iomap.c:163:5: warning: no previous prototype for ‘ioread64_hi_lo’ [-Wmissing-prototypes]
163 | u64 ioread64_hi_lo(const void __iomem *addr)
| ^~~~~~~~~~~~~~
../lib/iomap.c:170:5: warning: no previous prototype for ‘ioread64be_lo_hi’ [-Wmissing-prototypes]
170 | u64 ioread64be_lo_hi(const void __iomem *addr)
| ^~~~~~~~~~~~~~~~
../lib/iomap.c:178:5: warning: no previous prototype for ‘ioread64be_hi_lo’ [-Wmissing-prototypes]
178 | u64 ioread64be_hi_lo(const void __iomem *addr)
| ^~~~~~~~~~~~~~~~
../lib/iomap.c:264:6: warning: no previous prototype for ‘iowrite64_lo_hi’ [-Wmissing-prototypes]
264 | void iowrite64_lo_hi(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~
../lib/iomap.c:272:6: warning: no previous prototype for ‘iowrite64_hi_lo’ [-Wmissing-prototypes]
272 | void iowrite64_hi_lo(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~
../lib/iomap.c:280:6: warning: no previous prototype for ‘iowrite64be_lo_hi’ [-Wmissing-prototypes]
280 | void iowrite64be_lo_hi(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~~~
../lib/iomap.c:288:6: warning: no previous prototype for ‘iowrite64be_hi_lo’ [-Wmissing-prototypes]
288 | void iowrite64be_hi_lo(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~~~
../drivers/gpu/drm/xe/xe_migrate.c: In function ‘xe_migrate_access_memory’:
../drivers/gpu/drm/xe/xe_migrate.c:1819:19: error: ‘XE_CACHELINE_MASK’ undeclared (first use in this function)
1819 | if (len & XE_CACHELINE_MASK ||
| ^~~~~~~~~~~~~~~~~
../drivers/gpu/drm/xe/xe_migrate.c:1819:19: note: each undeclared identifier is reported only once for each function it appears in
../drivers/gpu/drm/xe/xe_migrate.c:1828:35: error: ‘XE_CACHELINE_BYTES’ undeclared (first use in this function); did you mean ‘L1_CACHE_BYTES’?
1828 | u8 bounce[XE_CACHELINE_BYTES];
| ^~~~~~~~~~~~~~~~~~
| L1_CACHE_BYTES
../drivers/gpu/drm/xe/xe_migrate.c:1828:28: warning: unused variable ‘bounce’ [-Wunused-variable]
1828 | u8 bounce[XE_CACHELINE_BYTES];
| ^~~~~~
../drivers/gpu/drm/xe/xe_migrate.c:1888:27: error: implicit declaration of function ‘xe_migrate_vram’; did you mean ‘xe_migrate_to_vram’? [-Werror=implicit-function-declaration]
1888 | __fence = xe_migrate_vram(m, current_bytes,
| ^~~~~~~~~~~~~~~
| xe_migrate_to_vram
../drivers/gpu/drm/xe/xe_migrate.c:1892:43: error: ‘XE_MIGRATE_COPY_TO_VRAM’ undeclared (first use in this function)
1892 | XE_MIGRATE_COPY_TO_VRAM :
| ^~~~~~~~~~~~~~~~~~~~~~~
../drivers/gpu/drm/xe/xe_migrate.c:1893:43: error: ‘XE_MIGRATE_COPY_TO_SRAM’ undeclared (first use in this function)
1893 | XE_MIGRATE_COPY_TO_SRAM);
| ^~~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
make[7]: *** [../scripts/Makefile.build:207: drivers/gpu/drm/xe/xe_migrate.o] Error 1
make[7]: *** Waiting for unfinished jobs....
make[6]: *** [../scripts/Makefile.build:465: drivers/gpu/drm/xe] Error 2
make[5]: *** [../scripts/Makefile.build:465: drivers/gpu/drm] Error 2
make[4]: *** [../scripts/Makefile.build:465: drivers/gpu] Error 2
make[3]: *** [../scripts/Makefile.build:465: drivers] Error 2
make[2]: *** [/kernel/Makefile:1994: .] Error 2
make[1]: *** [/kernel/Makefile:251: __sub-make] Error 2
make: *** [Makefile:251: __sub-make] Error 2
+ cleanup
++ stat -c %u:%g /kernel
+ chown -R 1003:1003 /kernel
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
2025-04-04 1:03 ` kernel test robot
@ 2025-04-04 3:36 ` kernel test robot
2025-04-04 3:46 ` kernel test robot
3 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2025-04-04 3:36 UTC (permalink / raw)
To: Matthew Brost; +Cc: llvm, oe-kbuild-all
Hi Matthew,
kernel test robot noticed the following build errors:
[auto build test ERROR on drm-xe/drm-xe-next]
[also build test ERROR on next-20250403]
[cannot apply to drm-exynos/exynos-drm-next linus/master drm/drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-misc/drm-misc-next drm-tip/drm-tip v6.14]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Matthew-Brost/drm-xe-Add-devcoredump-chunking/20250404-042700
base: https://gitlab.freedesktop.org/drm/xe/kernel.git drm-xe-next
patch link: https://lore.kernel.org/r/20250403202705.18488-3-matthew.brost%40intel.com
patch subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
config: x86_64-buildonly-randconfig-003-20250404 (https://download.01.org/0day-ci/archive/20250404/202504041302.vLhBU2QJ-lkp@intel.com/config)
compiler: clang version 20.1.2 (https://github.com/llvm/llvm-project 58df0ef89dd64126512e4ee27b4ac3fd8ddf6247)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250404/202504041302.vLhBU2QJ-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202504041302.vLhBU2QJ-lkp@intel.com/
All errors (new ones prefixed by >>):
>> drivers/gpu/drm/xe/xe_migrate.c:1819:12: error: use of undeclared identifier 'XE_CACHELINE_MASK'
1819 | if (len & XE_CACHELINE_MASK ||
| ^
drivers/gpu/drm/xe/xe_migrate.c:1820:34: error: use of undeclared identifier 'XE_CACHELINE_MASK'
1820 | ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
| ^
>> drivers/gpu/drm/xe/xe_migrate.c:1828:14: error: use of undeclared identifier 'XE_CACHELINE_BYTES'
1828 | u8 bounce[XE_CACHELINE_BYTES];
| ^
drivers/gpu/drm/xe/xe_migrate.c:1833:23: error: use of undeclared identifier 'XE_CACHELINE_MASK'
1833 | (offset & XE_CACHELINE_MASK));
| ^
drivers/gpu/drm/xe/xe_migrate.c:1832:13: error: use of undeclared identifier 'XE_CACHELINE_BYTES'
1832 | XE_CACHELINE_BYTES -
| ^
drivers/gpu/drm/xe/xe_migrate.c:1834:30: error: use of undeclared identifier 'XE_CACHELINE_MASK'
1834 | int ptr_offset = offset & XE_CACHELINE_MASK;
| ^
drivers/gpu/drm/xe/xe_migrate.c:1838:15: error: use of undeclared identifier 'XE_CACHELINE_MASK'
1838 | ~XE_CACHELINE_MASK,
| ^
drivers/gpu/drm/xe/xe_migrate.c:1852:24: error: use of undeclared identifier 'XE_CACHELINE_MASK'
1852 | offset & ~XE_CACHELINE_MASK,
| ^
>> drivers/gpu/drm/xe/xe_migrate.c:1888:13: error: call to undeclared function 'xe_migrate_vram'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
1888 | __fence = xe_migrate_vram(m, current_bytes,
| ^
drivers/gpu/drm/xe/xe_migrate.c:1888:13: note: did you mean 'xe_migrate_to_vram'?
drivers/gpu/drm/xe/xe_migrate.h:98:19: note: 'xe_migrate_to_vram' declared here
98 | struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
| ^
>> drivers/gpu/drm/xe/xe_migrate.c:1892:8: error: use of undeclared identifier 'XE_MIGRATE_COPY_TO_VRAM'
1892 | XE_MIGRATE_COPY_TO_VRAM :
| ^
>> drivers/gpu/drm/xe/xe_migrate.c:1893:8: error: use of undeclared identifier 'XE_MIGRATE_COPY_TO_SRAM'
1893 | XE_MIGRATE_COPY_TO_SRAM);
| ^
11 errors generated.
vim +/XE_CACHELINE_MASK +1819 drivers/gpu/drm/xe/xe_migrate.c
1785
1786 /**
1787 * xe_migrate_access_memory - Access memory of a BO via GPU
1788 *
1789 * @m: The migration context.
1790 * @bo: buffer object
1791 * @offset: access offset into buffer object
1792 * @buf: pointer to caller memory to read into or write from
1793 * @len: length of access
1794 * @write: write access
1795 *
1796 * Access memory of a BO via GPU either reading in or writing from a passed in
1797 * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
1798 * read to or write from pointer.
1799 *
1800 * Returns:
1801 * 0 if successful, negative error code on failure.
1802 */
1803 int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
1804 unsigned long offset, void *buf, int len,
1805 int write)
1806 {
1807 struct xe_tile *tile = m->tile;
1808 struct xe_device *xe = tile_to_xe(tile);
1809 struct xe_res_cursor cursor;
1810 struct dma_fence *fence = NULL;
1811 dma_addr_t *dma_addr;
1812 unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
1813 int bytes_left = len, current_page = 0;
1814 void *orig_buf = buf;
1815
1816 xe_bo_assert_held(bo);
1817
1818 /* Use bounce buffer for small access and unaligned access */
> 1819 if (len & XE_CACHELINE_MASK ||
1820 ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
1821 int buf_offset = 0;
1822
1823 /*
1824 * Less than ideal for large unaligned access but this should be
1825 * fairly rare, can fixup if this becomes common.
1826 */
1827 do {
> 1828 u8 bounce[XE_CACHELINE_BYTES];
1829 void *ptr = (void *)bounce;
1830 int err;
1831 int copy_bytes = min_t(int, bytes_left,
1832 XE_CACHELINE_BYTES -
1833 (offset & XE_CACHELINE_MASK));
1834 int ptr_offset = offset & XE_CACHELINE_MASK;
1835
1836 err = xe_migrate_access_memory(m, bo,
1837 offset &
1838 ~XE_CACHELINE_MASK,
1839 (void *)ptr,
1840 sizeof(bounce), 0);
1841 if (err)
1842 return err;
1843
1844 if (!write) {
1845 memcpy(buf + buf_offset, ptr + ptr_offset,
1846 copy_bytes);
1847 goto next;
1848 }
1849
1850 memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
1851 err = xe_migrate_access_memory(m, bo,
1852 offset & ~XE_CACHELINE_MASK,
1853 (void *)ptr,
1854 sizeof(bounce), 0);
1855 if (err)
1856 return err;
1857
1858 next:
1859 bytes_left -= copy_bytes;
1860 buf_offset += copy_bytes;
1861 offset += copy_bytes;
1862 } while (bytes_left);
1863
1864 return 0;
1865 }
1866
1867 dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
1868 if (IS_ERR(dma_addr))
1869 return PTR_ERR(dma_addr);
1870
1871 xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
1872
1873 do {
1874 struct dma_fence *__fence;
1875 u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
1876 cursor.start;
1877 int current_bytes;
1878
1879 if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
1880 current_bytes = min_t(int, bytes_left,
1881 MAX_PREEMPTDISABLE_TRANSFER);
1882 else
1883 current_bytes = min_t(int, bytes_left, cursor.size);
1884
1885 if (fence)
1886 dma_fence_put(fence);
1887
> 1888 __fence = xe_migrate_vram(m, current_bytes,
1889 (unsigned long)buf & ~PAGE_MASK,
1890 dma_addr + current_page,
1891 vram_addr, write ?
> 1892 XE_MIGRATE_COPY_TO_VRAM :
> 1893 XE_MIGRATE_COPY_TO_SRAM);
1894 if (IS_ERR(__fence)) {
1895 if (fence)
1896 dma_fence_wait(fence, false);
1897 fence = __fence;
1898 goto out_err;
1899 }
1900 fence = __fence;
1901
1902 buf += current_bytes;
1903 offset += current_bytes;
1904 current_page = (int)(buf - orig_buf) / PAGE_SIZE;
1905 bytes_left -= current_bytes;
1906 if (bytes_left)
1907 xe_res_next(&cursor, current_bytes);
1908 } while (bytes_left);
1909
1910 dma_fence_wait(fence, false);
1911 dma_fence_put(fence);
1912 xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
1913
1914 return 0;
1915
1916 out_err:
1917 xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
1918 return PTR_ERR(fence);
1919 }
1920
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
` (2 preceding siblings ...)
2025-04-04 3:36 ` kernel test robot
@ 2025-04-04 3:46 ` kernel test robot
3 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2025-04-04 3:46 UTC (permalink / raw)
To: Matthew Brost; +Cc: oe-kbuild-all
Hi Matthew,
kernel test robot noticed the following build errors:
[auto build test ERROR on drm-xe/drm-xe-next]
[also build test ERROR on next-20250403]
[cannot apply to drm-exynos/exynos-drm-next linus/master drm/drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-misc/drm-misc-next drm-tip/drm-tip v6.14]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Matthew-Brost/drm-xe-Add-devcoredump-chunking/20250404-042700
base: https://gitlab.freedesktop.org/drm/xe/kernel.git drm-xe-next
patch link: https://lore.kernel.org/r/20250403202705.18488-3-matthew.brost%40intel.com
patch subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
config: alpha-allyesconfig (https://download.01.org/0day-ci/archive/20250404/202504041115.bbGTufbJ-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250404/202504041115.bbGTufbJ-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202504041115.bbGTufbJ-lkp@intel.com/
All errors (new ones prefixed by >>):
drivers/gpu/drm/xe/xe_migrate.c: In function 'xe_migrate_access_memory':
>> drivers/gpu/drm/xe/xe_migrate.c:1819:19: error: 'XE_CACHELINE_MASK' undeclared (first use in this function)
1819 | if (len & XE_CACHELINE_MASK ||
| ^~~~~~~~~~~~~~~~~
drivers/gpu/drm/xe/xe_migrate.c:1819:19: note: each undeclared identifier is reported only once for each function it appears in
>> drivers/gpu/drm/xe/xe_migrate.c:1828:35: error: 'XE_CACHELINE_BYTES' undeclared (first use in this function); did you mean 'L1_CACHE_BYTES'?
1828 | u8 bounce[XE_CACHELINE_BYTES];
| ^~~~~~~~~~~~~~~~~~
| L1_CACHE_BYTES
drivers/gpu/drm/xe/xe_migrate.c:1828:28: warning: unused variable 'bounce' [-Wunused-variable]
1828 | u8 bounce[XE_CACHELINE_BYTES];
| ^~~~~~
>> drivers/gpu/drm/xe/xe_migrate.c:1888:27: error: implicit declaration of function 'xe_migrate_vram'; did you mean 'xe_migrate_to_vram'? [-Wimplicit-function-declaration]
1888 | __fence = xe_migrate_vram(m, current_bytes,
| ^~~~~~~~~~~~~~~
| xe_migrate_to_vram
>> drivers/gpu/drm/xe/xe_migrate.c:1892:43: error: 'XE_MIGRATE_COPY_TO_VRAM' undeclared (first use in this function)
1892 | XE_MIGRATE_COPY_TO_VRAM :
| ^~~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/xe/xe_migrate.c:1893:43: error: 'XE_MIGRATE_COPY_TO_SRAM' undeclared (first use in this function)
1893 | XE_MIGRATE_COPY_TO_SRAM);
| ^~~~~~~~~~~~~~~~~~~~~~~
vim +/XE_CACHELINE_MASK +1819 drivers/gpu/drm/xe/xe_migrate.c
1785
1786 /**
1787 * xe_migrate_access_memory - Access memory of a BO via GPU
1788 *
1789 * @m: The migration context.
1790 * @bo: buffer object
1791 * @offset: access offset into buffer object
1792 * @buf: pointer to caller memory to read into or write from
1793 * @len: length of access
1794 * @write: write access
1795 *
1796 * Access memory of a BO via GPU either reading in or writing from a passed in
1797 * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
1798 * read to or write from pointer.
1799 *
1800 * Returns:
1801 * 0 if successful, negative error code on failure.
1802 */
1803 int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
1804 unsigned long offset, void *buf, int len,
1805 int write)
1806 {
1807 struct xe_tile *tile = m->tile;
1808 struct xe_device *xe = tile_to_xe(tile);
1809 struct xe_res_cursor cursor;
1810 struct dma_fence *fence = NULL;
1811 dma_addr_t *dma_addr;
1812 unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
1813 int bytes_left = len, current_page = 0;
1814 void *orig_buf = buf;
1815
1816 xe_bo_assert_held(bo);
1817
1818 /* Use bounce buffer for small access and unaligned access */
> 1819 if (len & XE_CACHELINE_MASK ||
1820 ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
1821 int buf_offset = 0;
1822
1823 /*
1824 * Less than ideal for large unaligned access but this should be
1825 * fairly rare, can fixup if this becomes common.
1826 */
1827 do {
> 1828 u8 bounce[XE_CACHELINE_BYTES];
1829 void *ptr = (void *)bounce;
1830 int err;
1831 int copy_bytes = min_t(int, bytes_left,
1832 XE_CACHELINE_BYTES -
1833 (offset & XE_CACHELINE_MASK));
1834 int ptr_offset = offset & XE_CACHELINE_MASK;
1835
1836 err = xe_migrate_access_memory(m, bo,
1837 offset &
1838 ~XE_CACHELINE_MASK,
1839 (void *)ptr,
1840 sizeof(bounce), 0);
1841 if (err)
1842 return err;
1843
1844 if (!write) {
1845 memcpy(buf + buf_offset, ptr + ptr_offset,
1846 copy_bytes);
1847 goto next;
1848 }
1849
1850 memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
1851 err = xe_migrate_access_memory(m, bo,
1852 offset & ~XE_CACHELINE_MASK,
1853 (void *)ptr,
1854 sizeof(bounce), 0);
1855 if (err)
1856 return err;
1857
1858 next:
1859 bytes_left -= copy_bytes;
1860 buf_offset += copy_bytes;
1861 offset += copy_bytes;
1862 } while (bytes_left);
1863
1864 return 0;
1865 }
1866
1867 dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
1868 if (IS_ERR(dma_addr))
1869 return PTR_ERR(dma_addr);
1870
1871 xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
1872
1873 do {
1874 struct dma_fence *__fence;
1875 u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
1876 cursor.start;
1877 int current_bytes;
1878
1879 if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
1880 current_bytes = min_t(int, bytes_left,
1881 MAX_PREEMPTDISABLE_TRANSFER);
1882 else
1883 current_bytes = min_t(int, bytes_left, cursor.size);
1884
1885 if (fence)
1886 dma_fence_put(fence);
1887
> 1888 __fence = xe_migrate_vram(m, current_bytes,
1889 (unsigned long)buf & ~PAGE_MASK,
1890 dma_addr + current_page,
1891 vram_addr, write ?
> 1892 XE_MIGRATE_COPY_TO_VRAM :
> 1893 XE_MIGRATE_COPY_TO_SRAM);
1894 if (IS_ERR(__fence)) {
1895 if (fence)
1896 dma_fence_wait(fence, false);
1897 fence = __fence;
1898 goto out_err;
1899 }
1900 fence = __fence;
1901
1902 buf += current_bytes;
1903 offset += current_bytes;
1904 current_page = (int)(buf - orig_buf) / PAGE_SIZE;
1905 bytes_left -= current_bytes;
1906 if (bytes_left)
1907 xe_res_next(&cursor, current_bytes);
1908 } while (bytes_left);
1909
1910 dma_fence_wait(fence, false);
1911 dma_fence_put(fence);
1912 xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
1913
1914 return 0;
1915
1916 out_err:
1917 xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
1918 return PTR_ERR(fence);
1919 }
1920
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
2025-04-03 21:28 ` Cavitt, Jonathan
@ 2025-04-10 3:58 ` Matthew Brost
0 siblings, 0 replies; 20+ messages in thread
From: Matthew Brost @ 2025-04-10 3:58 UTC (permalink / raw)
To: Cavitt, Jonathan
Cc: intel-xe@lists.freedesktop.org, dri-devel@lists.freedesktop.org
On Thu, Apr 03, 2025 at 03:28:48PM -0600, Cavitt, Jonathan wrote:
> -----Original Message-----
> From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
> Sent: Thursday, April 3, 2025 1:27 PM
> To: intel-xe@lists.freedesktop.org
> Cc: dri-devel@lists.freedesktop.org
> Subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
> >
> > Add migrate layer functions to access VRAM and update
> > xe_ttm_access_memory to use for non-visible access and large (more than
> > 16k) BO access. 8G devcoreump on BMG observed 3 minute CPU copy time vs.
> > 3s GPU copy time.
> >
> > v4:
> > - Fix non-page aligned accesses
> > - Add support for small / unaligned access
> > - Update commit message indicating migrate used for large accesses (Auld)
> > - Fix warning in xe_res_cursor for non-zero offset
> > v5:
> > - Fix 32 bit build (CI)
> > v6:
> > - Rebase and use SVM migration copy functions
> > v7:
> > - Fix build error (CI)
> >
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>
> I left some notes down below. Most are non-blocking, which I labeled as such. And
> as for the rest, it's probable that I'm just misunderstanding some parts of the code, and
> that the notes I left are not relevant.
>
> So, I'll be providing my reviewed-by now in the case that the blocking review notes
> turn out to not need to be addressed. And in the case they need to be addressed,
> you can take my reviewed-by for the next revision.
>
> Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
>
> > ---
> > drivers/gpu/drm/xe/xe_bo.c | 15 ++-
> > drivers/gpu/drm/xe/xe_migrate.c | 221 ++++++++++++++++++++++++++++++--
> > drivers/gpu/drm/xe/xe_migrate.h | 4 +
> > 3 files changed, 223 insertions(+), 17 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> > index 3c7c2353d3c8..c7e6b03d4aef 100644
> > --- a/drivers/gpu/drm/xe/xe_bo.c
> > +++ b/drivers/gpu/drm/xe/xe_bo.c
> > @@ -1414,6 +1414,7 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
> > struct xe_res_cursor cursor;
> > struct xe_vram_region *vram;
> > int bytes_left = len;
> > + int err = 0;
> >
> > xe_bo_assert_held(bo);
> > xe_device_assert_mem_access(xe);
> > @@ -1421,9 +1422,14 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
> > if (!mem_type_is_vram(ttm_bo->resource->mem_type))
> > return -EIO;
> >
> > - /* FIXME: Use GPU for non-visible VRAM */
> > - if (!xe_ttm_resource_visible(ttm_bo->resource))
> > - return -EIO;
> > + if (!xe_ttm_resource_visible(ttm_bo->resource) || len >= SZ_16K) {
> > + struct xe_migrate *migrate =
> > + mem_type_to_migrate(xe, ttm_bo->resource->mem_type);
> > +
> > + err = xe_migrate_access_memory(migrate, bo, offset, buf, len,
> > + write);
> > + goto out;
> > + }
> >
> > vram = res_to_mem_region(ttm_bo->resource);
> > xe_res_first(ttm_bo->resource, offset & PAGE_MASK,
> > @@ -1447,7 +1453,8 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
> > xe_res_next(&cursor, PAGE_SIZE);
> > } while (bytes_left);
> >
> > - return len;
> > +out:
> > + return err ?: len;
> > }
> >
> > const struct ttm_device_funcs xe_ttm_funcs = {
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> > index ff0fc2fb0eb9..ba8568691d99 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.c
> > +++ b/drivers/gpu/drm/xe/xe_migrate.c
> > @@ -670,6 +670,7 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
> > u32 mocs = 0;
> > u32 tile_y = 0;
> >
> > + xe_gt_assert(gt, !(pitch & 3));
> > xe_gt_assert(gt, size / pitch <= S16_MAX);
> > xe_gt_assert(gt, pitch / 4 <= S16_MAX);
> > xe_gt_assert(gt, pitch <= U16_MAX);
> > @@ -1602,8 +1603,12 @@ enum xe_migrate_copy_dir {
> > XE_MIGRATE_COPY_TO_SRAM,
> > };
> >
> > +#define XE_CACHELINE_BYTES 64ull
> > +#define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1)
> > +
> > static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> > - unsigned long npages,
> > + unsigned long len,
> > + unsigned long sram_offset,
> > dma_addr_t *sram_addr, u64 vram_addr,
> > const enum xe_migrate_copy_dir dir)
> > {
> > @@ -1613,17 +1618,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> > struct dma_fence *fence = NULL;
> > u32 batch_size = 2;
> > u64 src_L0_ofs, dst_L0_ofs;
> > - u64 round_update_size;
> > struct xe_sched_job *job;
> > struct xe_bb *bb;
> > u32 update_idx, pt_slot = 0;
> > + unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
> > + unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
> > + PAGE_SIZE : 4;
> > int err;
> >
> > - if (npages * PAGE_SIZE > MAX_PREEMPTDISABLE_TRANSFER)
> > - return ERR_PTR(-EINVAL);
> > + if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
> > + (sram_offset | vram_addr) & XE_CACHELINE_MASK))
> > + return ERR_PTR(-EOPNOTSUPP);
> >
> > - round_update_size = npages * PAGE_SIZE;
> > - batch_size += pte_update_cmd_size(round_update_size);
> > + xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);
>
> Hmm... Does this case still need to return an error value? I don't think replacing it with an
> xe_assert is entirely valid, as asserts should not be used as a replacement of proper error
> handling. Or so I've been told.
>
> So, I guess the question is: what error was this preventing previously, and is it still a concern?
>
This should have been an assert before - it just validating the caller
passing in sane arguments which we do everywhere in Xe. If that changes,
then boom the assert will pop.
> > +
> > + batch_size += pte_update_cmd_size(len);
> > batch_size += EMIT_COPY_DW;
> >
> > bb = xe_bb_new(gt, batch_size, use_usm_batch);
> > @@ -1633,22 +1642,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> > }
> >
> > build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
> > - sram_addr, round_update_size);
> > + sram_addr, len + sram_offset);
> >
> > if (dir == XE_MIGRATE_COPY_TO_VRAM) {
> > - src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
> > + src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
> > dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
> >
> > } else {
> > src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
> > - dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
> > + dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
> > }
> >
> > bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> > update_idx = bb->len;
> >
> > - emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
> > - XE_PAGE_SIZE);
> > + emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
> >
> > job = xe_bb_create_migration_job(m->q, bb,
> > xe_migrate_batch_base(m, use_usm_batch),
> > @@ -1696,7 +1704,7 @@ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
> > dma_addr_t *src_addr,
> > u64 dst_addr)
> > {
> > - return xe_migrate_vram(m, npages, src_addr, dst_addr,
> > + return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
> > XE_MIGRATE_COPY_TO_VRAM);
> > }
> >
> > @@ -1717,12 +1725,199 @@ struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
> > u64 src_addr,
> > dma_addr_t *dst_addr)
> > {
> > - return xe_migrate_vram(m, npages, dst_addr, src_addr,
> > + return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
> > XE_MIGRATE_COPY_TO_SRAM);
> > }
> >
> > #endif
> >
> > +static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
> > + int len, int write)
> > +{
> > + unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
> > +
> > + for (i = 0; i < npages; ++i) {
> > + if (!dma_addr[i])
> > + continue;
>
> Non-blocking:
> I'm guessing the dma_addr array can be non-contiguous (I.E. position 3 having a
> dma address does not imply positions 1 and 2 have them). Because otherwise
> this can be a break condition instead of a continue.
>
It should be a break, let me fix that.
> > +
> > + dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
> > + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
> > + }
> > + kfree(dma_addr);
> > +}
> > +
> > +static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe,
> > + void *buf, int len, int write)
> > +{
> > + dma_addr_t *dma_addr;
> > + unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
> > +
> > + dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL);
> > + if (!dma_addr)
> > + return ERR_PTR(-ENOMEM);
> > +
> > + for (i = 0; i < npages; ++i) {
> > + dma_addr_t addr;
> > + struct page *page;
> > +
> > + if (is_vmalloc_addr(buf))
> > + page = vmalloc_to_page(buf);
> > + else
> > + page = virt_to_page(buf);
> > +
> > + addr = dma_map_page(xe->drm.dev,
> > + page, 0, PAGE_SIZE,
> > + write ? DMA_TO_DEVICE :
> > + DMA_FROM_DEVICE);
> > + if (dma_mapping_error(xe->drm.dev, addr))
> > + goto err_fault;
> > +
> > + dma_addr[i] = addr;
> > + buf += PAGE_SIZE;
> > + }
> > +
> > + return dma_addr;
> > +
> > +err_fault:
> > + xe_migrate_dma_unmap(xe, dma_addr, len, write);
> > + return ERR_PTR(-EFAULT);
> > +}
> > +
> > +/**
> > + * xe_migrate_access_memory - Access memory of a BO via GPU
> > + *
> > + * @m: The migration context.
> > + * @bo: buffer object
> > + * @offset: access offset into buffer object
> > + * @buf: pointer to caller memory to read into or write from
> > + * @len: length of access
> > + * @write: write access
> > + *
> > + * Access memory of a BO via GPU either reading in or writing from a passed in
> > + * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
> > + * read to or write from pointer.
> > + *
> > + * Returns:
> > + * 0 if successful, negative error code on failure.
> > + */
> > +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> > + unsigned long offset, void *buf, int len,
> > + int write)
> > +{
> > + struct xe_tile *tile = m->tile;
> > + struct xe_device *xe = tile_to_xe(tile);
> > + struct xe_res_cursor cursor;
> > + struct dma_fence *fence = NULL;
> > + dma_addr_t *dma_addr;
> > + unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
> > + int bytes_left = len, current_page = 0;
> > + void *orig_buf = buf;
> > +
> > + xe_bo_assert_held(bo);
> > +
> > + /* Use bounce buffer for small access and unaligned access */
> > + if (len & XE_CACHELINE_MASK ||
> > + ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
> > + int buf_offset = 0;
> > +
> > + /*
> > + * Less than ideal for large unaligned access but this should be
> > + * fairly rare, can fixup if this becomes common.
> > + */
> > + do {
> > + u8 bounce[XE_CACHELINE_BYTES];
> > + void *ptr = (void *)bounce;
> > + int err;
> > + int copy_bytes = min_t(int, bytes_left,
> > + XE_CACHELINE_BYTES -
> > + (offset & XE_CACHELINE_MASK));
> > + int ptr_offset = offset & XE_CACHELINE_MASK;
> > +
> > + err = xe_migrate_access_memory(m, bo,
> > + offset &
> > + ~XE_CACHELINE_MASK,
> > + (void *)ptr,
> > + sizeof(bounce), 0);
> > + if (err)
> > + return err;
> > +
> > + if (!write) {
> > + memcpy(buf + buf_offset, ptr + ptr_offset,
> > + copy_bytes);
> > + goto next;
> > + }
>
> Non-blocking:
> This should debatably be an if-else block, rather than relying on a "goto next"
> here, as "goto next" is not used elsewhere in the function.
>
> """
> if (write) {
> memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
> err = xe_migrate_access_memory(m, bo,
> offset & ~XE_CACHELINE_MASK,
> (void *)ptr,
> sizeof(bounce), 0);
> } else {
> memcpy(buf + buf_offset, ptr + ptr_offset,
> copy_bytes);
> }
>
> if (err)
> return err;
> """
Yea, let clean this up.
>
> > +
> > + memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
> > + err = xe_migrate_access_memory(m, bo,
> > + offset & ~XE_CACHELINE_MASK,
> > + (void *)ptr,
> > + sizeof(bounce), 0);
> > + if (err)
> > + return err;
> > +
> > +next:
> > + bytes_left -= copy_bytes;
> > + buf_offset += copy_bytes;
> > + offset += copy_bytes;
> > + } while (bytes_left);
> > +
> > + return 0;
> > + }
> > +
> > + dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
> > + if (IS_ERR(dma_addr))
> > + return PTR_ERR(dma_addr);
> > +
> > + xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
> > +
> > + do {
> > + struct dma_fence *__fence;
> > + u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
> > + cursor.start;
> > + int current_bytes;
> > +
> > + if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
> > + current_bytes = min_t(int, bytes_left,
> > + MAX_PREEMPTDISABLE_TRANSFER);
> > + else
> > + current_bytes = min_t(int, bytes_left, cursor.size);
> > +
> > + if (fence)
> > + dma_fence_put(fence);
> > +
> > + __fence = xe_migrate_vram(m, current_bytes,
> > + (unsigned long)buf & ~PAGE_MASK,
> > + dma_addr + current_page,
> > + vram_addr, write ?
> > + XE_MIGRATE_COPY_TO_VRAM :
> > + XE_MIGRATE_COPY_TO_SRAM);
> > + if (IS_ERR(__fence)) {
> > + if (fence)
> > + dma_fence_wait(fence, false);
> > + fence = __fence;
> > + goto out_err;
> > + }
> > + fence = __fence;
>
> Non-blocking:
> It would be nice if we didn't have two paths assigning fence the __fence value,
> but any attempt I do to try and change that ends in having to compute the IS_ERR
> value for __fence twice. E.G.:
>
> """
> if (IS_ERR(__fence) && fence)
> dma_fence_wait(fence, false);
> fence = __fence;
> if (IS_ERR(fence))
> goto out_err;
> """
>
> So, either way works.
>
I prefer the way I have it.
> > +
> > + buf += current_bytes;
> > + offset += current_bytes;
> > + current_page = (int)(buf - orig_buf) / PAGE_SIZE;
> > + bytes_left -= current_bytes;
> > + if (bytes_left)
> > + xe_res_next(&cursor, current_bytes);
> > + } while (bytes_left);
> > +
> > + dma_fence_wait(fence, false);
> > + dma_fence_put(fence);
> > + xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
> > +
> > + return 0;
> > +
> > +out_err:
> > + xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
> > + return PTR_ERR(fence);
>
> Non-blocking:
> The branching execution might be able to be shortened. Perhaps:
> """
> dma_fence_wait(fence, false);
> dma_fence_put(fence);
>
> out_err:
> xe_migtate_dma_unmap(xe, dma_addr, len + page_offset, write);
> return IS_ERR(fence) ? PTR_ERR(fence) : 0;
> }
> """
Sure.
Matt
> -Jonathan Cavitt
>
> > +}
> > +
> > #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
> > #include "tests/xe_migrate.c"
> > #endif
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> > index 6ff9a963425c..fb9839c1bae0 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.h
> > +++ b/drivers/gpu/drm/xe/xe_migrate.h
> > @@ -112,6 +112,10 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
> > struct ttm_resource *dst,
> > bool copy_only_ccs);
> >
> > +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> > + unsigned long offset, void *buf, int len,
> > + int write);
> > +
> > #define XE_MIGRATE_CLEAR_FLAG_BO_DATA BIT(0)
> > #define XE_MIGRATE_CLEAR_FLAG_CCS_DATA BIT(1)
> > #define XE_MIGRATE_CLEAR_FLAG_FULL (XE_MIGRATE_CLEAR_FLAG_BO_DATA | \
> > --
> > 2.34.1
> >
> >
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2025-04-10 3:57 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-03 20:27 [PATCH v2 0/4] Large devcoredump file support Matthew Brost
2025-04-03 20:27 ` [PATCH v2 1/4] drm/xe: Add devcoredump chunking Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
2025-04-03 21:39 ` John Harrison
2025-04-03 22:32 ` Matthew Brost
2025-04-03 22:41 ` John Harrison
2025-04-04 2:33 ` Matthew Brost
2025-04-03 20:27 ` [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
2025-04-10 3:58 ` Matthew Brost
2025-04-04 1:03 ` kernel test robot
2025-04-04 3:36 ` kernel test robot
2025-04-04 3:46 ` kernel test robot
2025-04-03 20:27 ` [PATCH v2 3/4] drm/print: Add drm_printer_is_full Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
2025-04-03 20:27 ` [PATCH v2 4/4] drm/xe: Abort printing coredump in VM printer output if full Matthew Brost
2025-04-03 21:28 ` Cavitt, Jonathan
2025-04-04 3:31 ` ✓ CI.Patch_applied: success for Large devcoredump file support (rev2) Patchwork
2025-04-04 3:32 ` ✓ CI.checkpatch: " Patchwork
2025-04-04 3:32 ` ✗ CI.KUnit: failure " Patchwork
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.