From: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
To: Matthew Brost <matthew.brost@intel.com>, intel-xe@lists.freedesktop.org
Cc: matthew.auld@intel.com
Subject: Re: [PATCH v2 3/3] drm/xe: Add staging tree for VM binds
Date: Mon, 24 Feb 2025 10:22:37 +0100 [thread overview]
Message-ID: <d481a85557a539fe694ce85d7e4d31edc50b132a.camel@linux.intel.com> (raw)
In-Reply-To: <20250224040529.3025963-4-matthew.brost@intel.com>
Hi,
On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> Concurrent VM bind staging and zapping of PTEs from a userptr
> notifier
> do not work because the view of PTEs is not stable. VM binds cannot
> acquire the notifier lock during staging, as memory allocations are
> required. To resolve this race condition, use a staging tree for VM
> binds that is committed only under the userptr notifier lock during
> the
> final step of the bind. This ensures a consistent view of the PTEs in
> the userptr notifier.
>
> A follow up may only use staging for VM in fault mode as this is the
> only mode in which the above race exists.
>
> Suggested-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Cc: <stable@vger.kernel.org>
> Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single
> job")
> Fixes: a708f6501c69 ("drm/xe: Update PT layer with better error
> handling")
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
This looks even simpler than what I initially thought. Ideally I think
the staging tree would exist only under the vm lock during the multi
bind-unbind operation, but that adds code complication and execution
time compared to this solution which is more memory-hungry.
Minor comment below.
> ---
> drivers/gpu/drm/xe/xe_pt.c | 50 +++++++++++++++++++++++++------
> --
> drivers/gpu/drm/xe/xe_pt_walk.c | 3 +-
> drivers/gpu/drm/xe/xe_pt_walk.h | 4 +++
> 3 files changed, 44 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index add521b5c6ae..118b81cd7205 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -28,6 +28,8 @@ struct xe_pt_dir {
> struct xe_pt pt;
> /** @children: Array of page-table child nodes */
> struct xe_ptw *children[XE_PDES];
> + /** @staging: Array of page-table staging nodes */
> + struct xe_ptw *staging[XE_PDES];
> };
>
> #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
> @@ -50,7 +52,7 @@ static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt
> *pt)
>
> static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned
> int index)
> {
> - return container_of(pt_dir->children[index], struct xe_pt,
> base);
> + return container_of(pt_dir->staging[index], struct xe_pt,
> base);
It looks like the only user here is with staging == true, but I'm a
little worried about future users. Perhaps include a staging bool in
the interface and assert that it is true? Or rename to
xe_pt_entry_staging()?
> }
>
> static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm,
> @@ -125,6 +127,7 @@ struct xe_pt *xe_pt_create(struct xe_vm *vm,
> struct xe_tile *tile,
> }
> pt->bo = bo;
> pt->base.children = level ? as_xe_pt_dir(pt)->children :
> NULL;
> + pt->base.staging = level ? as_xe_pt_dir(pt)->staging : NULL;
>
> if (vm->xef)
> xe_drm_client_add_bo(vm->xef->client, pt->bo);
> @@ -377,8 +380,10 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk
> *xe_walk, struct xe_pt *parent,
> /* Continue building a non-connected subtree. */
> struct iosys_map *map = &parent->bo->vmap;
>
> - if (unlikely(xe_child))
> + if (unlikely(xe_child)) {
> parent->base.children[offset] = &xe_child-
> >base;
> + parent->base.staging[offset] = &xe_child-
> >base;
> + }
>
> xe_pt_write(xe_walk->vm->xe, map, offset, pte);
> parent->num_live++;
> @@ -619,6 +624,7 @@ xe_pt_stage_bind(struct xe_tile *tile, struct
> xe_vma *vma,
> .ops = &xe_pt_stage_bind_ops,
> .shifts = xe_normal_pt_shifts,
> .max_level = XE_PT_HIGHEST_LEVEL,
> + .staging = true,
> },
> .vm = xe_vma_vm(vma),
> .tile = tile,
> @@ -812,6 +818,7 @@ static const struct xe_pt_walk_ops
> xe_pt_zap_ptes_ops = {
>
> struct xe_pt_zap_ptes_flags {
> bool scratch:1;
> + bool staging:1;
> };
As mentioned in the comments on the previous commit, I don't think
zapping is needed when aborting a bind.
Otherwise LGTM
/Thomas
>
> static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma
> *vma,
> @@ -822,6 +829,7 @@ static bool __xe_pt_zap_ptes(struct xe_tile
> *tile, struct xe_vma *vma,
> .ops = &xe_pt_zap_ptes_ops,
> .shifts = xe_normal_pt_shifts,
> .max_level = XE_PT_HIGHEST_LEVEL,
> + .staging = flags.staging,
> },
> .tile = tile,
> .vm = xe_vma_vm(vma),
> @@ -905,7 +913,7 @@ static void xe_pt_cancel_bind(struct xe_vma *vma,
> }
> }
>
> -static void xe_pt_commit_locks_assert(struct xe_vma *vma)
> +static void xe_pt_commit_prepare_locks_assert(struct xe_vma *vma)
> {
> struct xe_vm *vm = xe_vma_vm(vma);
>
> @@ -917,6 +925,16 @@ static void xe_pt_commit_locks_assert(struct
> xe_vma *vma)
> xe_vm_assert_held(vm);
> }
>
> +static void xe_pt_commit_locks_assert(struct xe_vma *vma)
> +{
> + struct xe_vm *vm = xe_vma_vm(vma);
> +
> + xe_pt_commit_prepare_locks_assert(vma);
> +
> + if (xe_vma_is_userptr(vma))
> + lockdep_assert_held_read(&vm-
> >userptr.notifier_lock);
> +}
> +
> static void xe_pt_commit(struct xe_vma *vma,
> struct xe_vm_pgtable_update *entries,
> u32 num_entries, struct llist_head
> *deferred)
> @@ -927,13 +945,17 @@ static void xe_pt_commit(struct xe_vma *vma,
>
> for (i = 0; i < num_entries; i++) {
> struct xe_pt *pt = entries[i].pt;
> + struct xe_pt_dir *pt_dir;
>
> if (!pt->level)
> continue;
>
> + pt_dir = as_xe_pt_dir(pt);
> for (j = 0; j < entries[i].qwords; j++) {
> struct xe_pt *oldpte =
> entries[i].pt_entries[j].pt;
> + int j_ = j + entries[i].ofs;
>
> + pt_dir->children[j_] = pt_dir->staging[j_];
> xe_pt_destroy(oldpte, xe_vma_vm(vma)->flags,
> deferred);
> }
> }
> @@ -945,7 +967,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
> {
> int i, j;
>
> - xe_pt_commit_locks_assert(vma);
> + xe_pt_commit_prepare_locks_assert(vma);
>
> for (i = num_entries - 1; i >= 0; --i) {
> struct xe_pt *pt = entries[i].pt;
> @@ -963,7 +985,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
> struct xe_pt *newpte = xe_pt_entry(pt_dir,
> j_);
> struct xe_pt *oldpte =
> entries[i].pt_entries[j].pt;
>
> - pt_dir->children[j_] = oldpte ? &oldpte-
> >base : 0;
> + pt_dir->staging[j_] = oldpte ? &oldpte->base
> : 0;
> xe_pt_destroy(newpte, xe_vma_vm(vma)->flags,
> NULL);
> }
> }
> @@ -975,7 +997,7 @@ static void xe_pt_commit_prepare_bind(struct
> xe_vma *vma,
> {
> u32 i, j;
>
> - xe_pt_commit_locks_assert(vma);
> + xe_pt_commit_prepare_locks_assert(vma);
>
> for (i = 0; i < num_entries; i++) {
> struct xe_pt *pt = entries[i].pt;
> @@ -996,7 +1018,7 @@ static void xe_pt_commit_prepare_bind(struct
> xe_vma *vma,
> if (xe_pt_entry(pt_dir, j_))
> oldpte = xe_pt_entry(pt_dir, j_);
>
> - pt_dir->children[j_] = &newpte->base;
> + pt_dir->staging[j_] = &newpte->base;
> entries[i].pt_entries[j].pt = oldpte;
> }
> }
> @@ -1237,7 +1259,10 @@ static void
> vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma
> *vma,
> struct xe_vm_pgtable_update_ops
> *pt_update)
> {
> - struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
> + struct xe_pt_zap_ptes_flags flags = {
> + .scratch = true,
> + .staging = true,
> + };
> int i, j, k;
>
> /*
> @@ -1590,6 +1615,7 @@ static unsigned int xe_pt_stage_unbind(struct
> xe_tile *tile, struct xe_vma *vma,
> .ops = &xe_pt_stage_unbind_ops,
> .shifts = xe_normal_pt_shifts,
> .max_level = XE_PT_HIGHEST_LEVEL,
> + .staging = true,
> },
> .tile = tile,
> .modified_start = xe_vma_start(vma),
> @@ -1631,7 +1657,7 @@ static void xe_pt_abort_unbind(struct xe_vma
> *vma,
> {
> int i, j;
>
> - xe_pt_commit_locks_assert(vma);
> + xe_pt_commit_prepare_locks_assert(vma);
>
> for (i = num_entries - 1; i >= 0; --i) {
> struct xe_vm_pgtable_update *entry = &entries[i];
> @@ -1644,7 +1670,7 @@ static void xe_pt_abort_unbind(struct xe_vma
> *vma,
> continue;
>
> for (j = entry->ofs; j < entry->ofs + entry->qwords;
> j++)
> - pt_dir->children[j] =
> + pt_dir->staging[j] =
> entries[i].pt_entries[j - entry-
> >ofs].pt ?
> &entries[i].pt_entries[j - entry-
> >ofs].pt->base : NULL;
> }
> @@ -1657,7 +1683,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
> {
> int i, j;
>
> - xe_pt_commit_locks_assert(vma);
> + xe_pt_commit_prepare_locks_assert(vma);
>
> for (i = 0; i < num_entries; ++i) {
> struct xe_vm_pgtable_update *entry = &entries[i];
> @@ -1672,7 +1698,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
> for (j = entry->ofs; j < entry->ofs + entry->qwords;
> j++) {
> entry->pt_entries[j - entry->ofs].pt =
> xe_pt_entry(pt_dir, j);
> - pt_dir->children[j] = NULL;
> + pt_dir->staging[j] = NULL;
> }
> }
> }
> diff --git a/drivers/gpu/drm/xe/xe_pt_walk.c
> b/drivers/gpu/drm/xe/xe_pt_walk.c
> index b8b3d2aea492..be602a763ff3 100644
> --- a/drivers/gpu/drm/xe/xe_pt_walk.c
> +++ b/drivers/gpu/drm/xe/xe_pt_walk.c
> @@ -74,7 +74,8 @@ int xe_pt_walk_range(struct xe_ptw *parent,
> unsigned int level,
> u64 addr, u64 end, struct xe_pt_walk *walk)
> {
> pgoff_t offset = xe_pt_offset(addr, level, walk);
> - struct xe_ptw **entries = parent->children ? parent-
> >children : NULL;
> + struct xe_ptw **entries = walk->staging ? (parent->staging
> ?: NULL) :
> + (parent->children ?: NULL);
> const struct xe_pt_walk_ops *ops = walk->ops;
> enum page_walk_action action;
> struct xe_ptw *child;
> diff --git a/drivers/gpu/drm/xe/xe_pt_walk.h
> b/drivers/gpu/drm/xe/xe_pt_walk.h
> index 5ecc4d2f0f65..5c02c244f7de 100644
> --- a/drivers/gpu/drm/xe/xe_pt_walk.h
> +++ b/drivers/gpu/drm/xe/xe_pt_walk.h
> @@ -11,12 +11,14 @@
> /**
> * struct xe_ptw - base class for driver pagetable subclassing.
> * @children: Pointer to an array of children if any.
> + * @staging: Pointer to an array of staging if any.
> *
> * Drivers could subclass this, and if it's a page-directory,
> typically
> * embed an array of xe_ptw pointers.
> */
> struct xe_ptw {
> struct xe_ptw **children;
> + struct xe_ptw **staging;
> };
>
> /**
> @@ -41,6 +43,8 @@ struct xe_pt_walk {
> * as shared pagetables.
> */
> bool shared_pt_mode;
> + /** @staging: Walk staging PT structure */
> + bool staging;
> };
>
> /**
next prev parent reply other threads:[~2025-02-24 13:44 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-24 4:05 [PATCH v2 0/3] Userptr fixes Matthew Brost
2025-02-24 4:05 ` [PATCH v2 1/3] drm/xe/userptr: fix EFAULT handling Matthew Brost
2025-02-24 4:05 ` [PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes Matthew Brost
2025-02-24 8:21 ` Thomas Hellström
2025-02-24 15:06 ` Matthew Brost
2025-02-24 15:20 ` Thomas Hellström
2025-02-24 15:35 ` Matthew Brost
2025-02-24 16:22 ` Thomas Hellström
2025-02-24 16:30 ` Matthew Brost
2025-02-24 4:05 ` [PATCH v2 3/3] drm/xe: Add staging tree for VM binds Matthew Brost
2025-02-24 9:22 ` Thomas Hellström [this message]
2025-02-24 14:41 ` Matthew Brost
2025-02-24 16:15 ` ✗ CI.Patch_applied: failure for Userptr fixes Patchwork
-- strict thread matches above, loose matches on Subject: below --
2025-02-24 3:24 [PATCH v2 0/3] " Matthew Brost
2025-02-24 3:24 ` [PATCH v2 3/3] drm/xe: Add staging tree for VM binds Matthew Brost
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=d481a85557a539fe694ce85d7e4d31edc50b132a.camel@linux.intel.com \
--to=thomas.hellstrom@linux.intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=matthew.auld@intel.com \
--cc=matthew.brost@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox