Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Matthew Auld <matthew.auld@intel.com>
To: Matthew Brost <matthew.brost@intel.com>,
	intel-xe@lists.freedesktop.org, dri-devel@lists.freedesktop.org
Cc: himal.prasad.ghimiray@intel.com, apopple@nvidia.com,
	airlied@gmail.com, thomas.hellstrom@linux.intel.com,
	simona.vetter@ffwll.ch, felix.kuehling@amd.com, dakr@kernel.org
Subject: Re: [PATCH v6 11/32] drm/xe: Nuke VM's mapping upon close
Date: Tue, 25 Feb 2025 18:05:13 +0000	[thread overview]
Message-ID: <bf1b2e41-1380-45de-8772-59ebbcb5cf56@intel.com> (raw)
In-Reply-To: <20250225044311.3178695-12-matthew.brost@intel.com>

On 25/02/2025 04:42, Matthew Brost wrote:
> Clear root PT entry and invalidate entire VM's address space when
> closing the VM. Will prevent the GPU from accessing any of the VM's
> memory after closing.
> 
> v2:
>   - s/vma/vm in kernel doc (CI)
> >   - Don't nuke migration VM as this occurs at driver unload (CI)
> v3:
>   - Rebase and pull into SVM series (Thomas)
>   - Wait for pending binds (Thomas)
> v5:
>   - Remove xe_gt_tlb_invalidation_fence_fini in error case (Matt Auld)
>   - Drop local migration bool (Thomas)
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
>   drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 22 +++++++++++++++++++++
>   drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h |  2 ++
>   drivers/gpu/drm/xe/xe_pt.c                  | 14 +++++++++++++
>   drivers/gpu/drm/xe/xe_pt.h                  |  3 +++
>   drivers/gpu/drm/xe/xe_vm.c                  | 20 +++++++++++++++++++
>   5 files changed, 61 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> index 0a93831c0a02..03072e094991 100644
> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> @@ -410,6 +410,28 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
>   	return send_tlb_invalidation(&gt->uc.guc, fence, action, len);
>   }
>   
> +/**
> + * xe_gt_tlb_invalidation_vm - Issue a TLB invalidation on this GT for a VM
> + * @gt: graphics tile
> + * @vm: VM to invalidate
> + *
> + * Invalidate entire VM's address space
> + */
> +void xe_gt_tlb_invalidation_vm(struct xe_gt *gt, struct xe_vm *vm)
> +{
> +	struct xe_gt_tlb_invalidation_fence fence;
> +	u64 range = 1ull << vm->xe->info.va_bits;
> +	int ret;
> +
> +	xe_gt_tlb_invalidation_fence_init(gt, &fence, true);
> +
> +	ret = xe_gt_tlb_invalidation_range(gt, &fence, 0, range, vm->usm.asid);
> +	if (ret < 0)
> +		return;
> +
> +	xe_gt_tlb_invalidation_fence_wait(&fence);
> +}
> +
>   /**
>    * xe_gt_tlb_invalidation_vma - Issue a TLB invalidation on this GT for a VMA
>    * @gt: GT structure
> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> index 672acfcdf0d7..abe9b03d543e 100644
> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> @@ -12,6 +12,7 @@
>   
>   struct xe_gt;
>   struct xe_guc;
> +struct xe_vm;
>   struct xe_vma;
>   
>   int xe_gt_tlb_invalidation_init_early(struct xe_gt *gt);
> @@ -21,6 +22,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt);
>   int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
>   			       struct xe_gt_tlb_invalidation_fence *fence,
>   			       struct xe_vma *vma);
> +void xe_gt_tlb_invalidation_vm(struct xe_gt *gt, struct xe_vm *vm);
>   int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
>   				 struct xe_gt_tlb_invalidation_fence *fence,
>   				 u64 start, u64 end, u32 asid);
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index 99b97bf37c05..c5060011ad43 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -214,6 +214,20 @@ void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred)
>   	xe_pt_free(pt);
>   }
>   
> +/**
> + * xe_pt_clear() - Clear a page-table.
> + * @xe: xe device.
> + * @pt: The page-table.
> + *
> + * Clears page-table by setting to zero.
> + */
> +void xe_pt_clear(struct xe_device *xe, struct xe_pt *pt)
> +{
> +	struct iosys_map *map = &pt->bo->vmap;
> +
> +	xe_map_memset(xe, map, 0, 0, SZ_4K);
> +}
> +
>   /**
>    * DOC: Pagetable building
>    *
> diff --git a/drivers/gpu/drm/xe/xe_pt.h b/drivers/gpu/drm/xe/xe_pt.h
> index 9ab386431cad..8e43912ae8e9 100644
> --- a/drivers/gpu/drm/xe/xe_pt.h
> +++ b/drivers/gpu/drm/xe/xe_pt.h
> @@ -13,6 +13,7 @@ struct dma_fence;
>   struct xe_bo;
>   struct xe_device;
>   struct xe_exec_queue;
> +struct xe_svm_range;
>   struct xe_sync_entry;
>   struct xe_tile;
>   struct xe_vm;
> @@ -35,6 +36,8 @@ void xe_pt_populate_empty(struct xe_tile *tile, struct xe_vm *vm,
>   
>   void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred);
>   
> +void xe_pt_clear(struct xe_device *xe, struct xe_pt *pt);
> +
>   int xe_pt_update_ops_prepare(struct xe_tile *tile, struct xe_vma_ops *vops);
>   struct dma_fence *xe_pt_update_ops_run(struct xe_tile *tile,
>   				       struct xe_vma_ops *vops);
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 1454f98e0cf6..f06b77a757bd 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -1584,7 +1584,27 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
>   static void xe_vm_close(struct xe_vm *vm)
>   {
>   	down_write(&vm->lock);
> +
>   	vm->size = 0;
> +
> +	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
> +		struct xe_tile *tile;
> +		struct xe_gt *gt;
> +		u8 id;
> +
> +		/* Wait for pending binds */
> +		dma_resv_wait_timeout(xe_vm_resv(vm),
> +				      DMA_RESV_USAGE_BOOKKEEP,
> +				      false, MAX_SCHEDULE_TIMEOUT);
> +
> +		for_each_tile(tile, vm->xe, id)
> +			if (vm->pt_root[id])
> +				xe_pt_clear(vm->xe, vm->pt_root[id]);

CI looks to be crashing here with various flavours of:

https://intel-gfx-ci.01.org/tree/intel-xe/xe-pw-137870v6/shard-dg2-466/igt@xe_fault_injection@vm-bind-fail-vm_bind_ioctl_ops_create.html

1> [310.004839] BUG: unable to handle page fault for address: 
ffffc901bc66e000
<1> [310.004854] #PF: supervisor write access in kernel mode
<1> [310.004861] #PF: error_code(0x0002) - not-present page
<6> [310.004867] PGD 100000067 P4D 100000067 PUD 0
<4> [310.005041]  <TASK>
<4> [310.005047]  ? show_regs+0x6c/0x80
<4> [310.005058]  ? __die+0x24/0x80
<4> [310.005068]  ? page_fault_oops+0x175/0x5d0
<4> [310.005079]  ? memset+0xb/0x20
<4> [310.005089]  ? kernelmode_fixup_or_oops.isra.0+0x69/0x90
<4> [310.005101]  ? __bad_area_nosemaphore+0x1bd/0x2f0
<4> [310.005112]  ? bad_area_nosemaphore+0x16/0x30
<4> [310.005122]  ? do_kern_addr_fault.part.0+0x62/0x80
<4> [310.005133]  ? exc_page_fault+0x2c8/0x300
<4> [310.005142]  ? asm_exc_page_fault+0x27/0x30
<4> [310.005155]  ? memset+0xb/0x20
<4> [310.005164]  ? memset_io+0x62/0x80
<4> [310.005174]  xe_pt_clear+0x81/0xa0 [xe]

I took a look and I think the root cause is that the error-injection 
IGTs are unbinding the device (like a hotunplug) while the vm is still 
left open (it is only closed when the driver instance is closed). So by 
the time we get here, the top-level vram mapping has already been 
unmapped from the kernel address space during the unbind, and 
xe_pt_clear() is then trying to write through that stale mapping. I 
think we need to sprinkle some drm_dev_enter() here, and maybe also 
around the TLB invalidation below. If the device is already gone then we 
should be able to skip both.

> +
> +		for_each_gt(gt, vm->xe, id)
> +			xe_gt_tlb_invalidation_vm(gt, vm);
> +	}
> +
>   	up_write(&vm->lock);
>   }
>   


  reply	other threads:[~2025-02-25 18:05 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-25  4:42 [PATCH v6 00/32] Introduce GPU SVM and Xe SVM implementation Matthew Brost
2025-02-25  4:42 ` [PATCH v6 01/32] drm/xe: Retry BO allocation Matthew Brost
2025-02-25  4:42 ` [PATCH v6 02/32] mm/migrate: Add migrate_device_pfns Matthew Brost
2025-02-25  4:42 ` [PATCH v6 03/32] mm/migrate: Trylock device page in do_swap_page Matthew Brost
2025-02-25  4:42 ` [PATCH v6 04/32] drm/pagemap: Add DRM pagemap Matthew Brost
2025-02-25 15:06   ` Matthew Auld
2025-02-25 18:16     ` Matthew Brost
2025-02-25  4:42 ` [PATCH v6 05/32] drm/xe/bo: Introduce xe_bo_put_async Matthew Brost
2025-02-25  4:42 ` [PATCH v6 06/32] drm/gpusvm: Add support for GPU Shared Virtual Memory Matthew Brost
2025-02-25 15:14   ` Matthew Auld
2025-02-25 18:16     ` Matthew Brost
2025-02-25  4:42 ` [PATCH v6 07/32] drm/xe: Select DRM_GPUSVM Kconfig Matthew Brost
2025-02-25  4:42 ` [PATCH v6 08/32] drm/xe/uapi: Add DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR Matthew Brost
2025-02-25  4:42 ` [PATCH v6 09/32] drm/xe: Add SVM init / close / fini to faulting VMs Matthew Brost
2025-02-25  4:42 ` [PATCH v6 10/32] drm/xe: Add dma_addr res cursor Matthew Brost
2025-02-25  4:42 ` [PATCH v6 11/32] drm/xe: Nuke VM's mapping upon close Matthew Brost
2025-02-25 18:05   ` Matthew Auld [this message]
2025-02-25 18:14     ` Matthew Brost
2025-02-25  4:42 ` [PATCH v6 12/32] drm/xe: Add SVM range invalidation and page fault Matthew Brost
2025-02-25  4:42 ` [PATCH v6 13/32] drm/gpuvm: Add DRM_GPUVA_OP_DRIVER Matthew Brost
2025-02-25  4:42 ` [PATCH v6 14/32] drm/xe: Add (re)bind to SVM page fault handler Matthew Brost
2025-02-26 17:00   ` Thomas Hellström
2025-02-26 17:18   ` Ghimiray, Himal Prasad
2025-02-25  4:42 ` [PATCH v6 15/32] drm/xe: Add SVM garbage collector Matthew Brost
2025-02-25  4:42 ` [PATCH v6 16/32] drm/xe: Add unbind to " Matthew Brost
2025-02-25  4:42 ` [PATCH v6 17/32] drm/xe: Do not allow CPU address mirror VMA unbind if the GPU has bindings Matthew Brost
2025-02-27 17:01   ` Thomas Hellström
2025-02-25  4:42 ` [PATCH v6 18/32] drm/xe: Enable CPU address mirror uAPI Matthew Brost
2025-02-25  4:42 ` [PATCH v6 19/32] drm/xe/uapi: Add DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR Matthew Brost
2025-02-25  4:42 ` [PATCH v6 20/32] drm/xe: Add migrate layer functions for SVM support Matthew Brost
2025-02-25  4:43 ` [PATCH v6 21/32] drm/xe: Add SVM device memory mirroring Matthew Brost
2025-02-25  4:43 ` [PATCH v6 22/32] drm/xe: Add drm_gpusvm_devmem to xe_bo Matthew Brost
2025-02-25  4:43 ` [PATCH v6 23/32] drm/xe: Add drm_pagemap ops to SVM Matthew Brost
2025-02-25  4:43 ` [PATCH v6 24/32] drm/xe: Add GPUSVM device memory copy vfunc functions Matthew Brost
2025-02-25  4:43 ` [PATCH v6 25/32] drm/xe: Add Xe SVM populate_devmem_pfn GPU SVM vfunc Matthew Brost
2025-02-25  4:43 ` [PATCH v6 26/32] drm/xe: Add Xe SVM devmem_release " Matthew Brost
2025-02-25  4:43 ` [PATCH v6 27/32] drm/xe: Add SVM VRAM migration Matthew Brost
2025-02-26 16:47   ` Thomas Hellström
2025-02-26 17:16   ` Ghimiray, Himal Prasad
2025-02-25  4:43 ` [PATCH v6 28/32] drm/xe: Basic SVM BO eviction Matthew Brost
2025-02-25  4:43 ` [PATCH v6 29/32] drm/xe: Add SVM debug Matthew Brost
2025-02-25  4:43 ` [PATCH v6 30/32] drm/xe: Add modparam for SVM notifier size Matthew Brost
2025-02-25  4:43 ` [PATCH v6 31/32] drm/xe: Add always_migrate_to_vram modparam Matthew Brost
2025-02-25  4:43 ` [PATCH v6 32/32] drm/doc: gpusvm: Add GPU SVM documentation Matthew Brost
2025-02-28  2:34   ` Alistair Popple
2025-02-28  4:36     ` Matthew Brost
2025-02-28  5:53       ` Alistair Popple
2025-03-01  0:35         ` Matthew Brost
2025-02-25  4:50 ` ✓ CI.Patch_applied: success for Introduce GPU SVM and Xe SVM implementation (rev6) Patchwork
2025-02-25  4:51 ` ✗ CI.checkpatch: warning " Patchwork
2025-02-25  4:52 ` ✓ CI.KUnit: success " Patchwork
2025-02-25  5:08 ` ✓ CI.Build: " Patchwork
2025-02-25  5:10 ` ✗ CI.Hooks: failure " Patchwork
2025-02-25  5:12 ` ✗ CI.checksparse: warning " Patchwork
2025-02-25  5:32 ` ✓ Xe.CI.BAT: success " Patchwork
2025-02-25  9:55 ` ✗ Xe.CI.Full: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bf1b2e41-1380-45de-8772-59ebbcb5cf56@intel.com \
    --to=matthew.auld@intel.com \
    --cc=airlied@gmail.com \
    --cc=apopple@nvidia.com \
    --cc=dakr@kernel.org \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=felix.kuehling@amd.com \
    --cc=himal.prasad.ghimiray@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.brost@intel.com \
    --cc=simona.vetter@ffwll.ch \
    --cc=thomas.hellstrom@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox