Intel-XE Archive on lore.kernel.org
From: Matthew Brost <matthew.brost@intel.com>
To: Matthew Auld <matthew.auld@intel.com>
Cc: <intel-xe@lists.freedesktop.org>,
	<dri-devel@lists.freedesktop.org>,
	<himal.prasad.ghimiray@intel.com>, <apopple@nvidia.com>,
	<airlied@gmail.com>,  <thomas.hellstrom@linux.intel.com>,
	<simona.vetter@ffwll.ch>, <felix.kuehling@amd.com>,
	<dakr@kernel.org>
Subject: Re: [PATCH v6 11/32] drm/xe: Nuke VM's mapping upon close
Date: Tue, 25 Feb 2025 10:14:06 -0800
Message-ID: <Z74Ibtxgmew7A4e7@lstrano-desk.jf.intel.com>
In-Reply-To: <bf1b2e41-1380-45de-8772-59ebbcb5cf56@intel.com>

On Tue, Feb 25, 2025 at 06:05:13PM +0000, Matthew Auld wrote:
> On 25/02/2025 04:42, Matthew Brost wrote:
> > Clear root PT entry and invalidate entire VM's address space when
> > closing the VM. Will prevent the GPU from accessing any of the VM's
> > memory after closing.
> > 
> > v2:
> >   - s/vma/vm in kernel doc (CI)
> >   - Don't nuke migration VM as this occurs at driver unload (CI)
> > v3:
> >   - Rebase and pull into SVM series (Thomas)
> >   - Wait for pending binds (Thomas)
> > v5:
> >   - Remove xe_gt_tlb_invalidation_fence_fini in error case (Matt Auld)
> >   - Drop local migration bool (Thomas)
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > ---
> >   drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 22 +++++++++++++++++++++
> >   drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h |  2 ++
> >   drivers/gpu/drm/xe/xe_pt.c                  | 14 +++++++++++++
> >   drivers/gpu/drm/xe/xe_pt.h                  |  3 +++
> >   drivers/gpu/drm/xe/xe_vm.c                  | 20 +++++++++++++++++++
> >   5 files changed, 61 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> > index 0a93831c0a02..03072e094991 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> > @@ -410,6 +410,28 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
> >   	return send_tlb_invalidation(&gt->uc.guc, fence, action, len);
> >   }
> > +/**
> > + * xe_gt_tlb_invalidation_vm - Issue a TLB invalidation on this GT for a VM
> > + * @gt: graphics tile
> > + * @vm: VM to invalidate
> > + *
> > + * Invalidate entire VM's address space
> > + */
> > +void xe_gt_tlb_invalidation_vm(struct xe_gt *gt, struct xe_vm *vm)
> > +{
> > +	struct xe_gt_tlb_invalidation_fence fence;
> > +	u64 range = 1ull << vm->xe->info.va_bits;
> > +	int ret;
> > +
> > +	xe_gt_tlb_invalidation_fence_init(gt, &fence, true);
> > +
> > +	ret = xe_gt_tlb_invalidation_range(gt, &fence, 0, range, vm->usm.asid);
> > +	if (ret < 0)
> > +		return;
> > +
> > +	xe_gt_tlb_invalidation_fence_wait(&fence);
> > +}
> > +
> >   /**
> >    * xe_gt_tlb_invalidation_vma - Issue a TLB invalidation on this GT for a VMA
> >    * @gt: GT structure
> > diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> > index 672acfcdf0d7..abe9b03d543e 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> > @@ -12,6 +12,7 @@
> >   struct xe_gt;
> >   struct xe_guc;
> > +struct xe_vm;
> >   struct xe_vma;
> >   int xe_gt_tlb_invalidation_init_early(struct xe_gt *gt);
> > @@ -21,6 +22,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt);
> >   int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
> >   			       struct xe_gt_tlb_invalidation_fence *fence,
> >   			       struct xe_vma *vma);
> > +void xe_gt_tlb_invalidation_vm(struct xe_gt *gt, struct xe_vm *vm);
> >   int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
> >   				 struct xe_gt_tlb_invalidation_fence *fence,
> >   				 u64 start, u64 end, u32 asid);
> > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > index 99b97bf37c05..c5060011ad43 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.c
> > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > @@ -214,6 +214,20 @@ void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred)
> >   	xe_pt_free(pt);
> >   }
> > +/**
> > + * xe_pt_clear() - Clear a page-table.
> > + * @xe: xe device.
> > + * @pt: The page-table.
> > + *
> > + * Clears page-table by setting to zero.
> > + */
> > +void xe_pt_clear(struct xe_device *xe, struct xe_pt *pt)
> > +{
> > +	struct iosys_map *map = &pt->bo->vmap;
> > +
> > +	xe_map_memset(xe, map, 0, 0, SZ_4K);
> > +}
> > +
> >   /**
> >    * DOC: Pagetable building
> >    *
> > diff --git a/drivers/gpu/drm/xe/xe_pt.h b/drivers/gpu/drm/xe/xe_pt.h
> > index 9ab386431cad..8e43912ae8e9 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.h
> > +++ b/drivers/gpu/drm/xe/xe_pt.h
> > @@ -13,6 +13,7 @@ struct dma_fence;
> >   struct xe_bo;
> >   struct xe_device;
> >   struct xe_exec_queue;
> > +struct xe_svm_range;
> >   struct xe_sync_entry;
> >   struct xe_tile;
> >   struct xe_vm;
> > @@ -35,6 +36,8 @@ void xe_pt_populate_empty(struct xe_tile *tile, struct xe_vm *vm,
> >   void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred);
> > +void xe_pt_clear(struct xe_device *xe, struct xe_pt *pt);
> > +
> >   int xe_pt_update_ops_prepare(struct xe_tile *tile, struct xe_vma_ops *vops);
> >   struct dma_fence *xe_pt_update_ops_run(struct xe_tile *tile,
> >   				       struct xe_vma_ops *vops);
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index 1454f98e0cf6..f06b77a757bd 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -1584,7 +1584,27 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
> >   static void xe_vm_close(struct xe_vm *vm)
> >   {
> >   	down_write(&vm->lock);
> > +
> >   	vm->size = 0;
> > +
> > +	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
> > +		struct xe_tile *tile;
> > +		struct xe_gt *gt;
> > +		u8 id;
> > +
> > +		/* Wait for pending binds */
> > +		dma_resv_wait_timeout(xe_vm_resv(vm),
> > +				      DMA_RESV_USAGE_BOOKKEEP,
> > +				      false, MAX_SCHEDULE_TIMEOUT);
> > +
> > +		for_each_tile(tile, vm->xe, id)
> > +			if (vm->pt_root[id])
> > +				xe_pt_clear(vm->xe, vm->pt_root[id]);
> 
> CI looks to be crashing here with various flavours of:
> 
> https://intel-gfx-ci.01.org/tree/intel-xe/xe-pw-137870v6/shard-dg2-466/igt@xe_fault_injection@vm-bind-fail-vm_bind_ioctl_ops_create.html
> 
> 1> [310.004839] BUG: unable to handle page fault for address:
> ffffc901bc66e000
> <1> [310.004854] #PF: supervisor write access in kernel mode
> <1> [310.004861] #PF: error_code(0x0002) - not-present page
> <6> [310.004867] PGD 100000067 P4D 100000067 PUD 0
> <4> [310.005041]  <TASK>
> <4> [310.005047]  ? show_regs+0x6c/0x80
> <4> [310.005058]  ? __die+0x24/0x80
> <4> [310.005068]  ? page_fault_oops+0x175/0x5d0
> <4> [310.005079]  ? memset+0xb/0x20
> <4> [310.005089]  ? kernelmode_fixup_or_oops.isra.0+0x69/0x90
> <4> [310.005101]  ? __bad_area_nosemaphore+0x1bd/0x2f0
> <4> [310.005112]  ? bad_area_nosemaphore+0x16/0x30
> <4> [310.005122]  ? do_kern_addr_fault.part.0+0x62/0x80
> <4> [310.005133]  ? exc_page_fault+0x2c8/0x300
> <4> [310.005142]  ? asm_exc_page_fault+0x27/0x30
> <4> [310.005155]  ? memset+0xb/0x20
> <4> [310.005164]  ? memset_io+0x62/0x80
> <4> [310.005174]  xe_pt_clear+0x81/0xa0 [xe]
> 
> I took a look and I think the root cause is that the error injection IGTs
> are unbinding the device (like a hotunplug) while the vm is still left
> open (it is only closed when the driver instance is closed). So by the
> time we get here, the top-level vram mapping has already been unmapped
> from the kernel address space during the unbind, and this code is then
> trying to write through that stale mapping. I think we need to sprinkle
> some drm_dev_enter() here, and maybe also for the invalidation below. If
> the device is already gone then we should be able to skip it.
> 

Thanks for looking at this, it would've confused me for a bit. Will add
drm_dev_enter/exit.
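
Something along these lines, I think (untested sketch; the
drm_dev_enter() / drm_dev_exit() pair with the 'idx' cookie is the
standard pattern from <drm/drm_drv.h>, the rest is the existing
xe_vm_close code from this patch):

	if (!(vm->flags & XE_VM_FLAG_MIGRATION)) {
		struct xe_tile *tile;
		struct xe_gt *gt;
		int idx;
		u8 id;

		/* Wait for pending binds */
		dma_resv_wait_timeout(xe_vm_resv(vm),
				      DMA_RESV_USAGE_BOOKKEEP,
				      false, MAX_SCHEDULE_TIMEOUT);

		/*
		 * Skip the nuke if the device has already been removed
		 * (e.g. hotunplug / unbind) - the kernel mapping of the
		 * root PT is gone by then, and the GPU can't access the
		 * VM's memory anymore anyway.
		 */
		if (drm_dev_enter(&vm->xe->drm, &idx)) {
			for_each_tile(tile, vm->xe, id)
				if (vm->pt_root[id])
					xe_pt_clear(vm->xe, vm->pt_root[id]);

			for_each_gt(gt, vm->xe, id)
				xe_gt_tlb_invalidation_vm(gt, vm);

			drm_dev_exit(idx);
		}
	}

That should cover both the xe_pt_clear() write and the TLB invalidation
in one critical section, so neither runs against an unplugged device.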

Matt

> > +
> > +		for_each_gt(gt, vm->xe, id)
> > +			xe_gt_tlb_invalidation_vm(gt, vm);
> > +	}
> > +
> >   	up_write(&vm->lock);
> >   }
> 
