From: Matthew Brost <matthew.brost@intel.com>
To: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
Cc: <intel-xe@lists.freedesktop.org>,
<dri-devel@lists.freedesktop.org>, <apopple@nvidia.com>,
<airlied@gmail.com>, <christian.koenig@amd.com>,
<simona.vetter@ffwll.ch>, <felix.kuehling@amd.com>,
<dakr@kernel.org>
Subject: Re: [PATCH v2 12/29] drm/xe: Add SVM garbage collector
Date: Mon, 16 Dec 2024 15:46:04 -0800 [thread overview]
Message-ID: <Z2C7vIFdVBdbigFA@lstrano-desk.jf.intel.com> (raw)
In-Reply-To: <334bb2c9d2ff97caf712f5f8c17b494c63869265.camel@linux.intel.com>
On Mon, Dec 16, 2024 at 11:36:20AM +0100, Thomas Hellström wrote:
> On Wed, 2024-12-11 at 11:17 -0800, Matthew Brost wrote:
> > On Tue, Nov 19, 2024 at 03:45:33PM +0100, Thomas Hellström wrote:
> > > On Tue, 2024-10-15 at 20:25 -0700, Matthew Brost wrote:
> > > > Add basic SVM garbage collector which can destroy an SVM range
> > > > upon
> > > > an
> > > > MMU UNMAP event.
> > > >
> > > > v2:
> > > > - Flush garbage collector in xe_svm_close
> > > >
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > > drivers/gpu/drm/xe/xe_svm.c | 87
> > > > +++++++++++++++++++++++++++++++-
> > > > drivers/gpu/drm/xe/xe_svm.h | 1 +
> > > > drivers/gpu/drm/xe/xe_vm.c | 4 ++
> > > > drivers/gpu/drm/xe/xe_vm_types.h | 5 ++
> > > > 4 files changed, 95 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_svm.c
> > > > b/drivers/gpu/drm/xe/xe_svm.c
> > > > index a9addaea316d..9c2f44cba166 100644
> > > > --- a/drivers/gpu/drm/xe/xe_svm.c
> > > > +++ b/drivers/gpu/drm/xe/xe_svm.c
> > > > @@ -30,6 +30,7 @@ xe_svm_range_alloc(struct drm_gpusvm *gpusvm)
> > > > if (!range)
> > > > return ERR_PTR(-ENOMEM);
> > > >
> > > > + INIT_LIST_HEAD(&range->garbage_collector_link);
> > > > xe_vm_get(gpusvm_to_vm(gpusvm));
> > > >
> > > > return &range->base;
> > > > @@ -46,6 +47,24 @@ static struct xe_svm_range *to_xe_range(struct
> > > > drm_gpusvm_range *r)
> > > > return container_of(r, struct xe_svm_range, base);
> > > > }
> > > >
> > > > +static void
> > > > +xe_svm_garbage_collector_add_range(struct xe_vm *vm, struct
> > > > xe_svm_range *range,
> > > > + const struct
> > > > mmu_notifier_range
> > > > *mmu_range)
> > > > +{
> > > > + struct xe_device *xe = vm->xe;
> > > > +
> > > > + drm_gpusvm_range_set_unmapped(&range->base, mmu_range);
> > > > +
> > > > + spin_lock(&vm->svm.garbage_collector.lock);
> > > > + if (list_empty(&range->garbage_collector_link))
> > > > + list_add_tail(&range->garbage_collector_link,
> > > > + &vm-
> > > > > svm.garbage_collector.range_list);
> > > > + spin_unlock(&vm->svm.garbage_collector.lock);
> > > > +
> > > > + queue_work(xe_device_get_root_tile(xe)->primary_gt-
> > > > > usm.pf_wq,
> > > > + &vm->svm.garbage_collector.work);
> > > > +}
> > > > +
> > > > static u8
> > > > xe_svm_range_notifier_event_begin(struct xe_vm *vm, struct
> > > > drm_gpusvm_range *r,
> > > > const struct
> > > > mmu_notifier_range
> > > > *mmu_range,
> > > > @@ -88,7 +107,9 @@ xe_svm_range_notifier_event_end(struct xe_vm
> > > > *vm,
> > > > struct drm_gpusvm_range *r,
> > > > struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
> > > >
> > > > drm_gpusvm_range_unmap_pages(&vm->svm.gpusvm, r, &ctx);
> > > > - /* TODO: Add range to garbage collector */
> > > > + if (mmu_range->event == MMU_NOTIFY_UNMAP)
> > > > + xe_svm_garbage_collector_add_range(vm,
> > > > to_xe_range(r),
> > > > + mmu_range);
> > > > }
> > > >
> > > > static void xe_svm_invalidate(struct drm_gpusvm *gpusvm,
> > > > @@ -184,6 +205,58 @@ static void xe_svm_invalidate(struct
> > > > drm_gpusvm
> > > > *gpusvm,
> > > > xe_svm_range_notifier_event_end(vm, r,
> > > > mmu_range);
> > > > }
> > > >
> > > > +static int __xe_svm_garbage_collector(struct xe_vm *vm,
> > > > + struct xe_svm_range
> > > > *range)
> > > > +{
> > > > + /* TODO: Do unbind */
> > > > +
> > > > + drm_gpusvm_range_remove(&vm->svm.gpusvm, &range->base);
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static int xe_svm_garbage_collector(struct xe_vm *vm)
> > > > +{
> > > > + struct xe_svm_range *range, *next;
> > > > + int err;
> > > > +
> > > > + lockdep_assert_held_write(&vm->lock);
> > > > +
> > > > + if (xe_vm_is_closed_or_banned(vm))
> > > > + return -ENOENT;
> > > > +
> > > > + spin_lock(&vm->svm.garbage_collector.lock);
> > > > + list_for_each_entry_safe(range, next,
> > > > + &vm-
> > > > > svm.garbage_collector.range_list,
> > > > + garbage_collector_link) {
> > > > + list_del(&range->garbage_collector_link);
> > > > + spin_unlock(&vm->svm.garbage_collector.lock);
> > >
> > > This looks broken, what if someone removed the "next" entry here?
> > > You probably want to use list_next_entry_or_null();
> > >
> >
> > Yea, let me fix this loop structure.
> >
> > > > +
> > > > + err = __xe_svm_garbage_collector(vm, range);
> > > > + if (err) {
> > > > + drm_warn(&vm->xe->drm,
> > > > + "Garbage collection failed:
> > > > %d\n",
> > > > err);
> > > > + xe_vm_kill(vm, true);
> > > > + return err;
> > > > + }
> > > > +
> > > > + spin_lock(&vm->svm.garbage_collector.lock);
> > > > + }
> > > > + spin_unlock(&vm->svm.garbage_collector.lock);
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static void xe_svm_garbage_collector_work_func(struct
> > > > work_struct
> > > > *w)
> > > > +{
> > > > + struct xe_vm *vm = container_of(w, struct xe_vm,
> > > > + svm.garbage_collector.wo
> > > > rk);
> > > > +
> > > > + down_write(&vm->lock);
> > > > + xe_svm_garbage_collector(vm);
> > > > + up_write(&vm->lock);
> > > > +}
> > > > +
> > > > static const struct drm_gpusvm_ops gpusvm_ops = {
> > > > .range_alloc = xe_svm_range_alloc,
> > > > .range_free = xe_svm_range_free,
> > > > @@ -198,6 +271,11 @@ static const u64 fault_chunk_sizes[] = {
> > > >
> > > > int xe_svm_init(struct xe_vm *vm)
> > > > {
> > > > + spin_lock_init(&vm->svm.garbage_collector.lock);
> > > > + INIT_LIST_HEAD(&vm->svm.garbage_collector.range_list);
> > > > + INIT_WORK(&vm->svm.garbage_collector.work,
> > > > + xe_svm_garbage_collector_work_func);
> > > > +
> > > > return drm_gpusvm_init(&vm->svm.gpusvm, "Xe SVM", &vm-
> > > > >xe-
> > > > > drm,
> > > > current->mm, NULL, 0, vm->size,
> > > > SZ_512M, &gpusvm_ops,
> > > > fault_chunk_sizes,
> > > > @@ -211,6 +289,8 @@ void xe_svm_close(struct xe_vm *vm)
> > > > /* Flush running notifiers making xe_vm_close() visable
> > > > */
> > > > xe_svm_notifier_lock(vm);
> > > > xe_svm_notifier_unlock(vm);
> > > > +
> > > > + flush_work(&vm->svm.garbage_collector.work);
> > > > }
> > > >
> > > > void xe_svm_fini(struct xe_vm *vm)
> > > > @@ -241,7 +321,10 @@ int xe_svm_handle_pagefault(struct xe_vm
> > > > *vm,
> > > > struct xe_vma *vma,
> > > > lockdep_assert_held_write(&vm->lock);
> > > >
> > > > retry:
> > > > - /* TODO: Run garbage collector */
> > > > + /* Always process UNMAPs first so view SVM ranges is
> > > > current
> > > > */
> > > > + err = xe_svm_garbage_collector(vm);
> > > > + if (err)
> > > > + return err;
> > > >
> > > > r = drm_gpusvm_range_find_or_insert(&vm->svm.gpusvm,
> > > > fault_addr,
> > > > xe_vma_start(vma),
> > > > xe_vma_end(vma),
> > > > diff --git a/drivers/gpu/drm/xe/xe_svm.h
> > > > b/drivers/gpu/drm/xe/xe_svm.h
> > > > index ee0bd1ae655b..06d90d0f71a6 100644
> > > > --- a/drivers/gpu/drm/xe/xe_svm.h
> > > > +++ b/drivers/gpu/drm/xe/xe_svm.h
> > > > @@ -17,6 +17,7 @@ struct xe_vma;
> > > >
> > > > struct xe_svm_range {
> > > > struct drm_gpusvm_range base;
> > > > + struct list_head garbage_collector_link;
> > > > u8 tile_present;
> > > > u8 tile_invalidated;
> > > > };
> > > > diff --git a/drivers/gpu/drm/xe/xe_vm.c
> > > > b/drivers/gpu/drm/xe/xe_vm.c
> > > > index 63aa0a25d3b7..399cbbdbddd5 100644
> > > > --- a/drivers/gpu/drm/xe/xe_vm.c
> > > > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > > > @@ -3071,6 +3071,10 @@ int xe_vm_bind_ioctl(struct drm_device
> > > > *dev,
> > > > void *data, struct drm_file *file)
> > > > goto put_exec_queue;
> > > > }
> > > >
> > > > + /* Ensure all UNMAPs visable */
> > > > + if (xe_vm_in_fault_mode(vm))
> > > > + flush_work(&vm->svm.garbage_collector.work);
> > >
> > > Hmm, what is someone added an UNMAP here?
> > >
> >
> > What we really trying to guard to against is user space doing
> > something
> > like this:
> >
> > addr = malloc();
> > gpu access
> > free(addr)
> > bind_bo(addr);
> >
> > We want to make sure all SVM mappings from the GPU access have
> > processed
> > the UNMAP events from the 'free(addr)'. So I think the code is fine
> > as
> > is - we just want to make sure UNMAP events prior to the IOCTL are
> > processed.
>
> But the notion of "prior" only exists in the presence of some form of
> synchronization, like a lock. Let's say another thread calls a free
> either
>
> a) before the flush_work
> b) racing with the flush_work
> c) after the flush_work
>
> Is there any difference WRT correctness and how do we differentiate?
>
> I don't think it's clear what this flush_work actually protects
> against.
>
I still think this is ok.
Let's say we have 2 threads...
- Thread A munmap(address A) - This address has a SVM GPU binding, we will get an UNMAP notifier
- Thread B address B = mmap() - This happens to be equal to address A
- Thread B bind BO(address B) - We flush_work, which ensures the UNMAP event is processed, giving the bind a current view of SVM state and avoiding the bind returning -EBUSY
The key here is that it is impossible for address A == B unless an UNMAP
event is queued in the garbage collector, unless I'm completely missing
something. This is really the only race we care about - we care that
UNMAP events matching the bind address which occurred prior to the bind are processed.
If other UNMAP events occur while processing the bind, that is fine as
they shouldn't collide. Worst case, if a collision occurs we'd return
-EBUSY in the bind IOCTL.
Does this make sense?
Matt
> Thanks,
> Thomas
>
>
>
>
> >
> > Matt
> >
> >
> > > Thanks,
> > > Thomas
> > >
> > > > +
> > > > err = down_write_killable(&vm->lock);
> > > > if (err)
> > > > goto put_vm;
> > > > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h
> > > > b/drivers/gpu/drm/xe/xe_vm_types.h
> > > > index b736e53779d2..2eae3575c409 100644
> > > > --- a/drivers/gpu/drm/xe/xe_vm_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> > > > @@ -146,6 +146,11 @@ struct xe_vm {
> > > > struct {
> > > > /** @svm.gpusvm: base GPUSVM used to track fault
> > > > allocations */
> > > > struct drm_gpusvm gpusvm;
> > > > + struct {
> > > > + spinlock_t lock;
> > > > + struct list_head range_list;
> > > > + struct work_struct work;
> > > > + } garbage_collector;
> > > > } svm;
> > > >
> > > > struct xe_device *xe;
> > >
>
next prev parent reply other threads:[~2024-12-16 23:45 UTC|newest]
Thread overview: 129+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-16 3:24 [PATCH v2 00/29] Introduce GPU SVM and Xe SVM implementation Matthew Brost
2024-10-16 3:24 ` [PATCH v2 01/29] drm/xe: Retry BO allocation Matthew Brost
2024-10-16 3:24 ` [PATCH v2 02/29] mm/migrate: Add migrate_device_prepopulated_range Matthew Brost
2024-10-16 4:04 ` Alistair Popple
2024-10-16 4:46 ` Matthew Brost
2024-10-17 0:56 ` Matthew Brost
2024-10-17 1:49 ` Alistair Popple
2024-10-17 2:45 ` Matthew Brost
2024-10-17 3:21 ` Alistair Popple
2024-10-17 4:07 ` Matthew Brost
2024-10-17 5:49 ` Alistair Popple
2024-10-17 15:40 ` Matthew Brost
2024-10-17 21:58 ` Alistair Popple
2024-10-18 0:54 ` Matthew Brost
2024-10-18 5:59 ` Alistair Popple
2024-10-18 6:39 ` Mika Penttilä
2024-10-18 7:16 ` Matthew Brost
2024-10-18 7:33 ` Matthew Brost
2024-10-18 7:34 ` Alistair Popple
2024-10-18 7:57 ` Matthew Brost
2024-10-18 4:02 ` Mika Penttilä
2024-10-18 5:55 ` Alistair Popple
2024-10-16 3:24 ` [PATCH v2 03/29] mm/migrate: Trylock device page in do_swap_page Matthew Brost
2024-10-16 4:00 ` Alistair Popple
2024-10-16 4:41 ` Matthew Brost
2024-10-17 1:51 ` Alistair Popple
2024-10-25 0:31 ` Matthew Brost
2024-10-29 6:37 ` Alistair Popple
2024-11-01 17:19 ` Matthew Brost
2024-11-28 23:31 ` Alistair Popple
2024-12-13 22:16 ` Matthew Brost
2024-12-14 5:59 ` Matthew Brost
2024-10-16 3:24 ` [PATCH v2 04/29] drm/pagemap: Add DRM pagemap Matthew Brost
2024-10-16 3:24 ` [PATCH v2 05/29] drm/gpusvm: Add support for GPU Shared Virtual Memory Matthew Brost
2024-10-31 18:58 ` Thomas Hellström
2024-11-04 22:53 ` Matthew Brost
2024-11-04 15:25 ` Thomas Hellström
2024-11-04 17:21 ` Matthew Brost
2024-11-04 18:59 ` Thomas Hellström
2024-11-04 23:07 ` Matthew Brost
2024-11-05 10:22 ` Thomas Hellström
2024-11-05 16:12 ` Matthew Brost
2024-11-05 16:28 ` Thomas Hellström
2024-11-05 14:48 ` Thomas Hellström
2024-11-05 16:32 ` Matthew Brost
2024-11-20 3:00 ` Gwan-gyeong Mun
2024-11-29 0:00 ` Alistair Popple
2024-12-14 1:16 ` Matthew Brost
2024-10-16 3:24 ` [PATCH v2 06/29] drm/xe/uapi: Add DRM_XE_VM_BIND_FLAG_SYSTEM_ALLOCATON flag Matthew Brost
2024-11-18 13:44 ` Thomas Hellström
2024-11-19 16:01 ` Matthew Brost
2024-10-16 3:24 ` [PATCH v2 07/29] drm/xe: Add SVM init / close / fini to faulting VMs Matthew Brost
2024-11-19 12:13 ` Thomas Hellström
2024-11-19 16:22 ` Matthew Brost
2024-10-16 3:24 ` [PATCH v2 08/29] drm/xe: Add dma_addr res cursor Matthew Brost
2024-11-19 12:15 ` Thomas Hellström
2024-11-19 16:24 ` Matthew Brost
2024-10-16 3:24 ` [PATCH v2 09/29] drm/xe: Add SVM range invalidation Matthew Brost
2024-11-19 13:56 ` Thomas Hellström
2024-12-11 19:01 ` Matthew Brost
2024-12-14 23:11 ` Matthew Brost
2024-12-16 10:01 ` Thomas Hellström
2024-12-16 16:09 ` Matthew Brost
2024-12-16 17:35 ` Thomas Hellström
2024-10-16 3:24 ` [PATCH v2 10/29] drm/gpuvm: Add DRM_GPUVA_OP_USER Matthew Brost
2024-11-19 13:57 ` Thomas Hellström
2024-11-19 16:26 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 11/29] drm/xe: Add (re)bind to SVM page fault handler Matthew Brost
2024-11-19 14:26 ` Thomas Hellström
2024-12-11 19:07 ` Matthew Brost
2024-12-16 10:03 ` Thomas Hellström
2024-10-16 3:25 ` [PATCH v2 12/29] drm/xe: Add SVM garbage collector Matthew Brost
2024-11-19 14:45 ` Thomas Hellström
2024-12-11 19:17 ` Matthew Brost
2024-12-16 10:36 ` Thomas Hellström
2024-12-16 23:46 ` Matthew Brost [this message]
2024-10-16 3:25 ` [PATCH v2 13/29] drm/xe: Add unbind to " Matthew Brost
2024-11-19 15:31 ` Thomas Hellström
2024-11-19 23:44 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 14/29] drm/xe: Do not allow system allocator VMA unbind if the GPU has bindings Matthew Brost
2024-11-19 16:33 ` Thomas Hellström
2024-11-19 23:37 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 15/29] drm/xe: Enable system allocator uAPI Matthew Brost
2024-11-19 16:34 ` Thomas Hellström
2024-10-16 3:25 ` [PATCH v2 16/29] drm/xe: Add migrate layer functions for SVM support Matthew Brost
2024-11-19 16:45 ` Thomas Hellström
2024-11-19 23:08 ` Matthew Brost
2024-11-20 8:04 ` Thomas Hellström
2024-12-11 19:11 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 17/29] drm/xe: Add SVM device memory mirroring Matthew Brost
2024-11-19 16:50 ` Thomas Hellström
2024-11-20 3:05 ` Gwan-gyeong Mun
2024-12-11 19:44 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 18/29] drm/xe: Add drm_gpusvm_devmem to xe_bo Matthew Brost
2024-11-19 16:51 ` Thomas Hellström
2024-12-15 4:38 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 19/29] drm/xe: Add GPUSVM devic memory copy vfunc functions Matthew Brost
2024-12-02 10:13 ` Thomas Hellström
2024-12-12 3:59 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 20/29] drm/xe: Add drm_pagemap ops to SVM Matthew Brost
2024-10-16 3:25 ` [PATCH v2 21/29] drm/xe: Add Xe SVM populate_devmem_pfn vfunc Matthew Brost
2024-12-02 10:19 ` Thomas Hellström
2024-10-16 3:25 ` [PATCH v2 22/29] drm/xe: Add Xe SVM devmem_release vfunc Matthew Brost
2024-12-02 10:21 ` Thomas Hellström
2024-10-16 3:25 ` [PATCH v2 23/29] drm/xe: Add BO flags required for SVM Matthew Brost
2024-12-02 10:44 ` Thomas Hellström
2024-12-11 21:42 ` Matthew Brost
2024-12-16 10:44 ` Thomas Hellström
2024-10-16 3:25 ` [PATCH v2 24/29] drm/xe: Add SVM VRAM migration Matthew Brost
2024-12-02 12:06 ` Thomas Hellström
2024-12-11 20:17 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 25/29] drm/xe: Basic SVM BO eviction Matthew Brost
2024-12-02 12:27 ` Thomas Hellström
2024-12-11 19:47 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 26/29] drm/xe: Add SVM debug Matthew Brost
2024-12-02 12:33 ` Thomas Hellström
2024-12-17 1:05 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 27/29] drm/xe: Add modparam for SVM notifier size Matthew Brost
2024-12-02 12:37 ` Thomas Hellström
2024-12-11 19:50 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 28/29] drm/xe: Add always_migrate_to_vram modparam Matthew Brost
2024-12-02 12:40 ` Thomas Hellström
2024-12-11 19:51 ` Matthew Brost
2024-10-16 3:25 ` [PATCH v2 29/29] drm/doc: gpusvm: Add GPU SVM documentation Matthew Brost
2024-12-02 13:00 ` Thomas Hellström
2024-12-17 23:14 ` Matthew Brost
2024-10-16 3:30 ` ✓ CI.Patch_applied: success for Introduce GPU SVM and Xe SVM implementation (rev2) Patchwork
2024-10-16 3:31 ` ✗ CI.checkpatch: warning " Patchwork
2024-10-16 3:31 ` ✗ CI.KUnit: failure " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=Z2C7vIFdVBdbigFA@lstrano-desk.jf.intel.com \
--to=matthew.brost@intel.com \
--cc=airlied@gmail.com \
--cc=apopple@nvidia.com \
--cc=christian.koenig@amd.com \
--cc=dakr@kernel.org \
--cc=dri-devel@lists.freedesktop.org \
--cc=felix.kuehling@amd.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=simona.vetter@ffwll.ch \
--cc=thomas.hellstrom@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox