Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
To: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>,
	intel-xe@lists.freedesktop.org
Cc: Maarten Lankhorst <dev@lankhorst.se>
Subject: Re: [Intel-xe] [PATCH 4/4] drm/xe: Implement VM snapshot support
Date: Fri, 27 Oct 2023 14:17:54 +0200	[thread overview]
Message-ID: <78608338-86d7-bc42-4a7e-ade7210c59a6@linux.intel.com> (raw)
In-Reply-To: <20231024122256.19512-5-dev@lankhorst.se>

Hi, Maarten


On 10/24/23 14:22, Maarten@mblankhorst.nl wrote:
> From: Maarten Lankhorst <dev@lankhorst.se>
>
> Signed-off-by: Maarten Lankhorst <dev@lankhorst.se>
> ---
>   drivers/gpu/drm/xe/xe_devcoredump.c       |   9 ++
>   drivers/gpu/drm/xe/xe_devcoredump_types.h |   2 +
>   drivers/gpu/drm/xe/xe_vm.c                | 111 ++++++++++++++++++++++
>   drivers/gpu/drm/xe/xe_vm.h                |   4 +
>   4 files changed, 126 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 68abc0b195be..298be162ed0c 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -16,6 +16,7 @@
>   #include "xe_guc_ct.h"
>   #include "xe_guc_submit.h"
>   #include "xe_hw_engine.h"
> +#include "xe_vm.h"
>   
>   /**
>    * DOC: Xe device coredump
> @@ -98,6 +99,10 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>   		if (coredump->snapshot.hwe[i])
>   			xe_hw_engine_snapshot_print(coredump->snapshot.hwe[i],
>   						    &p);
> +	if (coredump->snapshot.vm) {
> +		drm_printf(&p, "\n**** VM state ****\n");
> +		xe_vm_snapshot_print(coredump->snapshot.vm, &p);
> +	}
>   
>   	return count - iter.remain;
>   }
> @@ -116,6 +121,7 @@ static void xe_devcoredump_free(void *data)
>   	for (i = 0; i < XE_NUM_HW_ENGINES; i++)
>   		if (coredump->snapshot.hwe[i])
>   			xe_hw_engine_snapshot_free(coredump->snapshot.hwe[i]);
> +	xe_vm_snapshot_free(coredump->snapshot.vm);
>   
>   	coredump->captured = false;
>   	drm_info(&coredump_to_xe(coredump)->drm,
> @@ -151,6 +157,8 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>   
>   	coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
>   	coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(q);
> +	if (q->vm)
> +		coredump->snapshot.vm = xe_vm_snapshot_capture(q->vm);
>   
>   	for_each_hw_engine(hwe, q->gt, id) {
>   		if (hwe->class != q->hwe->class ||
> @@ -194,3 +202,4 @@ void xe_devcoredump(struct xe_exec_queue *q)
>   		      xe_devcoredump_read, xe_devcoredump_free);
>   }
>   #endif
> +
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 7fdad9c3d3dd..93c2ad7bdc54 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -33,6 +33,8 @@ struct xe_devcoredump_snapshot {
>   	struct xe_guc_submit_exec_queue_snapshot *ge;
>   	/** @hwe: HW Engine snapshot array */
>   	struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES];
> +	/** @vm: Snapshot of VM state */
> +	struct xe_vm_snapshot *vm;
>   };
>   
>   /**
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index b42ca1069c5b..13494c81e964 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -3329,3 +3329,114 @@ int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
>   
>   	return 0;
>   }
> +
> +struct xe_vm_snapshot {
Kerneldoc for the structs and external functions, please.
> +	unsigned long num_snaps;
> +	struct {
> +		uint64_t ofs, bo_ofs;
> +		unsigned long len;
> +		struct xe_bo *bo;
> +		void *data;
> +	} snap[];
> +};
> +
> +struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
> +{
> +	unsigned long num_snaps = 0, i;
> +	struct xe_vm_snapshot *snap;
> +	struct drm_gpuva *gpuva;
> +
> +	mutex_lock(&vm->snap_mutex); /*interruptible locks? */
> +	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
> +		if (gpuva->flags & XE_VMA_DUMPABLE)
> +			num_snaps++;
> +	}
> +
> +	snap = kvmalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
> +	if (!snap)
> +		goto out_unlock;
> +
> +	snap->num_snaps = num_snaps;
> +	i = 0;
> +	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
> +		struct xe_vma *vma = gpuva_to_vma(gpuva);
> +		struct xe_bo *bo = gem_to_xe_bo(vma->gpuva.gem.obj);
> +
> +		if (!(gpuva->flags & XE_VMA_DUMPABLE))
> +			continue;
> +
> +		snap->snap[i].ofs = xe_vma_start(vma);
> +		snap->snap[i].len = xe_vma_size(vma);
> +		snap->snap[i].bo = xe_bo_get(bo);
> +		snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
> +		i++;
> +	}
> +
> +out_unlock:
> +	mutex_unlock(&vm->snap_mutex);
> +	return snap;
> +}
> +
> +void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
> +{
> +	unsigned long i, j;
> +
> +	for (i = 0; i < snap->num_snaps; i++) {
> +		struct xe_bo *bo = snap->snap[i].bo;
> +		struct iosys_map src;
> +		int err;
> +
> +		if (snap->snap[i].data)
> +			goto print;
> +
> +		/* Capture data, but try only once */
> +		if (bo)
> +			snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
> +		if (!snap->snap[i].data)
> +			goto kvfree;
> +
> +		/* TODO: Locking, VMAP magic etc here.. */
> +		dma_resv_lock(bo->ttm.base.resv, NULL);

Hm. What is the thinking around deep pipelining here?
I figure a hang that caused the coredump may in theory be several 
migrations behind the current state of the bo, and the bo may not even 
have been dumpable when executing whatever caused the hang...

Does it make sense to wait for any KERNEL fences here?

/Thomas


> +		err = ttm_bo_vmap(&bo->ttm, &src);
> +		if (!err) {
> +			xe_map_memcpy_from(xe_bo_device(bo),
> +					   snap->snap[i].data,
> +					   &src, snap->snap[i].bo_ofs,
> +					   snap->snap[i].len);
> +			ttm_bo_vunmap(&bo->ttm, &src);
> +		}
> +		dma_resv_unlock(bo->ttm.base.resv);
> +
> +		if (err)
> +			goto kvfree;
> +
> +print:
> +		for (j = 0; j < snap->snap[i].len; j += 64) {
> +			uint32_t *x = snap->snap[i].data + j;
> +
> +			drm_printf(p, "[%llx] = { %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x }\n",
> +				   snap->snap[i].ofs + j, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> +				   x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
> +		}
> +		xe_bo_put(bo);
> +		snap->snap[i].bo = NULL;
> +		continue;
> +kvfree:
> +		kvfree(snap->snap[i].data);
> +		snap->snap[i].data = NULL;
> +
> +		drm_printf(p, "Unable to capture range [%llx-%llx]\n",
> +			   snap->snap[i].ofs, snap->snap[i].ofs + snap->snap[i].len - 1);
> +	}
> +}
> +
> +void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < snap->num_snaps; i++) {
> +		kvfree(snap->snap[i].data);
> +		xe_bo_put(snap->snap[i].bo);
> +	}
> +	kvfree(snap);
> +}
> diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
> index b08c75fbd8a1..ef23fe3c2f40 100644
> --- a/drivers/gpu/drm/xe/xe_vm.h
> +++ b/drivers/gpu/drm/xe/xe_vm.h
> @@ -233,3 +233,7 @@ static inline void vm_dbg(const struct drm_device *dev,
>   { /* noop */ }
>   #endif
>   #endif
> +
> +struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm);
> +void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p);
> +void xe_vm_snapshot_free(struct xe_vm_snapshot *snap);

  reply	other threads:[~2023-10-27 12:18 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-10-24 12:22 [Intel-xe] [PATCH 0/4] drm/xe: Implement snapshot support Maarten Lankhorst
2023-10-24 12:22 ` [Intel-xe] [PATCH 1/4] drm/xe: Add uapi for dumpable bos Maarten Lankhorst
2023-11-13 21:41   ` Souza, Jose
2023-10-24 12:22 ` [Intel-xe] [PATCH 2/4] drm/xe: Annotate each dumpable vma as such Maarten Lankhorst
2023-10-24 12:22 ` [Intel-xe] [PATCH 3/4] drm/xe: Add vm snapshot mutex for easily taking a vm snapshot during devcoredump Maarten Lankhorst
2023-10-24 12:22 ` [Intel-xe] [PATCH 4/4] drm/xe: Implement VM snapshot support Maarten Lankhorst
2023-10-27 12:17   ` Thomas Hellström [this message]
2023-10-27 17:47     ` Maarten Lankhorst
2023-11-13 21:52   ` Souza, Jose
2024-01-22 18:03   ` Souza, Jose
2023-10-24 13:07 ` [Intel-xe] ✓ CI.Patch_applied: success for drm/xe: Implement " Patchwork
2023-10-24 13:07 ` [Intel-xe] ✗ CI.checkpatch: warning " Patchwork
2023-10-24 13:09 ` [Intel-xe] ✓ CI.KUnit: success " Patchwork
2023-10-24 13:16 ` [Intel-xe] ✓ CI.Build: " Patchwork
2023-10-24 13:16 ` [Intel-xe] ✓ CI.Hooks: " Patchwork
2023-10-24 13:18 ` [Intel-xe] ✓ CI.checksparse: " Patchwork
2023-10-24 13:52 ` [Intel-xe] ✓ CI.BAT: " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=78608338-86d7-bc42-4a7e-ade7210c59a6@linux.intel.com \
    --to=thomas.hellstrom@linux.intel.com \
    --cc=maarten.lankhorst@linux.intel.com \
    --cc=dev@lankhorst.se \
    --cc=intel-xe@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox