* Re: [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations
2025-10-17 16:52 ` [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations Matthew Brost
@ 2025-10-21 17:55 ` Summers, Stuart
2025-10-21 20:36 ` Matthew Brost
2025-10-22 8:00 ` Tvrtko Ursulin
2025-10-23 12:28 ` Thomas Hellström
From: Summers, Stuart @ 2025-10-21 17:55 UTC (permalink / raw)
To: intel-xe@lists.freedesktop.org, Brost, Matthew
Cc: Santa, Carlos, thomas.hellstrom@linux.intel.com
On Fri, 2025-10-17 at 09:52 -0700, Matthew Brost wrote:
> When a burst of unbind jobs is issued, a dependency chain can form
> between the TLB invalidation of a previous unbind job and the current
> one. This leads to undesirable serialization, causing current jobs to
> wait unnecessarily for prior TLB invalidations, execute on the GPU when
> not needed, and significantly slow down the unbind burst—resulting in
> up to a 4× slowdown.
>
> To break this chain, mask the last bind queue dependency if the last
> fence's DMA context matches the TLB invalidation context. This allows
> full pipelining of unbinds and TLB invalidations while preserving
> correct dma-fence signaling semantics.
>
> Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
> drivers/gpu/drm/xe/xe_exec.c | 3 +-
> drivers/gpu/drm/xe/xe_exec_queue.c | 18 +++++++++--
> drivers/gpu/drm/xe/xe_exec_queue.h | 3 +-
> drivers/gpu/drm/xe/xe_pt.c | 15 +++++++--
> drivers/gpu/drm/xe/xe_sched_job.c | 44 ++++++++++++++++++++++++++-
> drivers/gpu/drm/xe/xe_sched_job.h | 7 ++++-
> drivers/gpu/drm/xe/xe_tlb_inval_job.c | 14 +++++++++
> drivers/gpu/drm/xe/xe_tlb_inval_job.h | 2 ++
> 8 files changed, 98 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> index 0dc27476832b..6034cfc8be06 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -294,7 +294,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  		goto err_put_job;
>  
>  	if (!xe_vm_in_lr_mode(vm)) {
> -		err = xe_sched_job_last_fence_add_dep(job, vm);
> +		err = xe_sched_job_last_fence_add_dep(job, vm, NO_MASK_DEP,
> +						      NO_MASK_DEP);
> if (err)
> goto err_put_job;
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 90cbc95f8e2e..d6f69d9bccba 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -25,6 +25,7 @@
> #include "xe_migrate.h"
> #include "xe_pm.h"
> #include "xe_ring_ops_types.h"
> +#include "xe_sched_job.h"
> #include "xe_trace.h"
> #include "xe_vm.h"
> #include "xe_pxp.h"
> @@ -1106,11 +1107,17 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm,
>  * xe_exec_queue_last_fence_test_dep - Test last fence dependency of queue
> * @q: The exec queue
> * @vm: The VM the engine does a bind or exec for
> + * @mask_ctx0: Mask dma-fence context0
> + * @mask_ctx1: Mask dma-fence context1
> + *
> + * Test last fence dependency of queue, skipping masked dma fence contexts.
> *
> * Returns:
> - * -ETIME if there exists an unsignalled last fence dependency, zero otherwise.
> + * -ETIME if there exists an unsignalled and unmasked last fence dependency,
> + * zero otherwise.
> */
> -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> +int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm,
> +				      u64 mask_ctx0, u64 mask_ctx1)
> {
> struct dma_fence *fence;
> int err = 0;
> @@ -1119,6 +1126,13 @@ int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> if (fence) {
>  		err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) ?
>  			0 : -ETIME;
> +
> +		if (err == -ETIME) {
> +			if (xe_sched_job_mask_dependency(fence, mask_ctx0,
> +							 mask_ctx1))
> +				err = 0;
> +		}
> +
> dma_fence_put(fence);
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> index a4dfbe858bda..99a35b22a46c 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> @@ -85,7 +85,8 @@ struct dma_fence *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
>  void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
>  				  struct dma_fence *fence);
>  int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> -				      struct xe_vm *vm);
> +				      struct xe_vm *vm, u64 mask_ctx0,
> +				      u64 mask_ctx1);
> void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
>
>  int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch);
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index d22fd1ccc0ba..bba9ae559f57 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -1341,10 +1341,21 @@ static int xe_pt_vm_dependencies(struct xe_sched_job *job,
> }
>
> if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
> + u64 mask_ctx0 = NO_MASK_DEP, mask_ctx1 = NO_MASK_DEP;
> +
> +		if (ijob)
> +			mask_ctx0 = xe_tlb_inval_job_fence_context(ijob);
> +		if (mjob)
> +			mask_ctx1 = xe_tlb_inval_job_fence_context(mjob);
Can we rename these ictx and mctx for consistency?
Also, do we really need both of these here? Shouldn't we always have
the primary GT inval (ictx) and so just need to check that one? My
understanding is that the reason is we might be adding either one of
these as the last fence, so we need both checks. But in that case would
it be better to check against all dependencies, or even just the last
two? Wouldn't that also help if multiple apps are trying to free at
once here, so we have interleaved unbind dependencies?
> +
>  		if (job)
> -			err = xe_sched_job_last_fence_add_dep(job, vm);
> +			err = xe_sched_job_last_fence_add_dep(job, vm,
> +							      mask_ctx0,
> +							      mask_ctx1);
>  		else
> -			err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> +			err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q,
> +								vm, mask_ctx0,
> +								mask_ctx1);
> }
>
> for (i = 0; job && !err && i < vops->num_syncs; i++)
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
> index d21bf8f26964..7cbdd87904c6 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.c
> +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> @@ -6,6 +6,7 @@
> #include "xe_sched_job.h"
>
> #include <uapi/drm/xe_drm.h>
> +#include <linux/dma-fence-array.h>
> #include <linux/dma-fence-chain.h>
> #include <linux/slab.h>
>
> @@ -295,19 +296,60 @@ void xe_sched_job_push(struct xe_sched_job *job)
> xe_sched_job_put(job);
> }
>
> +/**
> + * xe_sched_job_mask_dependency() - Determine if a dma-fence dependency can be masked
> + * @fence: The dma-fence to check
> + * @mask_ctx0: First context to compare against the fence's context
> + * @mask_ctx1: Second context to compare against the fence's context
> + *
> + * This function checks whether the context of the given dma-fence matches
> + * either of the provided mask contexts. If a match is found, the dependency
> + * represented by the fence can be skipped. If the fence is a dma-fence-array,
> + * its individual fences are unwound and checked.
> + *
> + * Return: true if the fence can be masked (i.e., skipped), false otherwise.
> + */
> +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> +				  u64 mask_ctx1)
> +{
> + if (dma_fence_is_array(fence)) {
> + struct dma_fence *__fence;
> + int index;
> +
> + dma_fence_array_for_each(__fence, index, fence)
> + if (__fence->context == mask_ctx0 ||
> + __fence->context == mask_ctx1)
> + return true;
> + } else if (fence->context == mask_ctx0 ||
> + fence->context == mask_ctx1) {
> + return true;
> + }
> +
> + return false;
> +}
> +
> /**
>  * xe_sched_job_last_fence_add_dep - Add last fence dependency to job
>  * @job: job to add the last fence dependency to
>  * @vm: virtual memory job belongs to
> + * @mask_ctx0: Mask dma-fence context0
> + * @mask_ctx1: Mask dma-fence context1
> + *
> + * Add last fence dependency to job, skipping masked dma fence contexts.
>  *
>  * Returns:
>  * 0 on success, or an error on failing to expand the array.
>  */
> -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm)
> +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> +				    u64 mask_ctx0, u64 mask_ctx1)
> {
> struct dma_fence *fence;
>
> fence = xe_exec_queue_last_fence_get(job->q, vm);
> +	if (xe_sched_job_mask_dependency(fence, mask_ctx0, mask_ctx1)) {
> + dma_fence_put(fence);
> + return 0;
> + }
>
> return drm_sched_job_add_dependency(&job->drm, fence);
> }
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h
> index 3dc72c5c1f13..81d8e848e605 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.h
> +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> @@ -58,7 +58,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job);
> void xe_sched_job_arm(struct xe_sched_job *job);
> void xe_sched_job_push(struct xe_sched_job *job);
>
> -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm);
> +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> +				    u64 mask_ctx0, u64 mask_ctx1);
> void xe_sched_job_init_user_fence(struct xe_sched_job *job,
> struct xe_sync_entry *sync);
>
> @@ -93,4 +94,8 @@ void xe_sched_job_snapshot_print(struct xe_sched_job_snapshot *snapshot, struct
>  int xe_sched_job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
>  			  enum dma_resv_usage usage);
>
> +#define NO_MASK_DEP (~0x0ull)
> +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> +				  u64 mask_ctx1);
> +
> +
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> index 492def04a559..f2fe7f9fbb22 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> @@ -32,6 +32,8 @@ struct xe_tlb_inval_job {
> u64 start;
> /** @end: End address to invalidate */
> u64 end;
> + /** @fence_context: Fence context for job */
> + u64 fence_context;
> /** @asid: Address space ID to invalidate */
> u32 asid;
> /** @fence_armed: Fence has been armed */
> @@ -101,6 +103,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
> job->asid = asid;
> job->fence_armed = false;
> job->dep.ops = &dep_job_ops;
This means the "finished" context per the entity definition, right? Can
you either add a note here or change the job->fence_context name to
reflect that? Or otherwise, why is this adding the +1 here?
Thanks,
Stuart
> + job->fence_context = entity->fence_context + 1;
> kref_init(&job->refcount);
>  	xe_exec_queue_get(q);	/* Pairs with put in xe_tlb_inval_job_destroy */
>
> @@ -266,3 +269,14 @@ void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job)
> if (!IS_ERR_OR_NULL(job))
> kref_put(&job->refcount, xe_tlb_inval_job_destroy);
> }
> +
> +/**
> + * xe_tlb_inval_job_fence_context() - TLB invalidation job fence context
> + * @job: TLB invalidation job object
> + *
> + * Return: TLB invalidation job fence context
> + */
> +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job)
> +{
> + return job->fence_context;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> index e63edcb26b50..2576165c2228 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> @@ -30,4 +30,6 @@ void xe_tlb_inval_job_get(struct xe_tlb_inval_job *job);
>
> void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job);
>
> +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job);
> +
> #endif
* Re: [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations
2025-10-21 17:55 ` Summers, Stuart
@ 2025-10-21 20:36 ` Matthew Brost
2025-10-21 20:43 ` Summers, Stuart
From: Matthew Brost @ 2025-10-21 20:36 UTC (permalink / raw)
To: Summers, Stuart
Cc: intel-xe@lists.freedesktop.org, Santa, Carlos,
thomas.hellstrom@linux.intel.com
On Tue, Oct 21, 2025 at 11:55:56AM -0600, Summers, Stuart wrote:
> On Fri, 2025-10-17 at 09:52 -0700, Matthew Brost wrote:
> > When a burst of unbind jobs is issued, a dependency chain can form
> > between the TLB invalidation of a previous unbind job and the current
> > one. This leads to undesirable serialization, causing current jobs to
> > wait unnecessarily for prior TLB invalidations, execute on the GPU when
> > not needed, and significantly slow down the unbind burst—resulting in
> > up to a 4× slowdown.
> >
> > To break this chain, mask the last bind queue dependency if the last
> > fence's DMA context matches the TLB invalidation context. This allows
> > full pipelining of unbinds and TLB invalidations while preserving
> > correct dma-fence signaling semantics.
> >
> > Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_exec.c | 3 +-
> > drivers/gpu/drm/xe/xe_exec_queue.c | 18 +++++++++--
> > drivers/gpu/drm/xe/xe_exec_queue.h | 3 +-
> > drivers/gpu/drm/xe/xe_pt.c | 15 +++++++--
> > drivers/gpu/drm/xe/xe_sched_job.c | 44 ++++++++++++++++++++++++++-
> > drivers/gpu/drm/xe/xe_sched_job.h | 7 ++++-
> > drivers/gpu/drm/xe/xe_tlb_inval_job.c | 14 +++++++++
> > drivers/gpu/drm/xe/xe_tlb_inval_job.h | 2 ++
> > 8 files changed, 98 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_exec.c
> > b/drivers/gpu/drm/xe/xe_exec.c
> > index 0dc27476832b..6034cfc8be06 100644
> > --- a/drivers/gpu/drm/xe/xe_exec.c
> > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > @@ -294,7 +294,8 @@ int xe_exec_ioctl(struct drm_device *dev, void
> > *data, struct drm_file *file)
> > goto err_put_job;
> >
> > if (!xe_vm_in_lr_mode(vm)) {
> > - err = xe_sched_job_last_fence_add_dep(job, vm);
> > + err = xe_sched_job_last_fence_add_dep(job, vm,
> > NO_MASK_DEP,
> > + NO_MASK_DEP);
> > if (err)
> > goto err_put_job;
> >
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> > b/drivers/gpu/drm/xe/xe_exec_queue.c
> > index 90cbc95f8e2e..d6f69d9bccba 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > @@ -25,6 +25,7 @@
> > #include "xe_migrate.h"
> > #include "xe_pm.h"
> > #include "xe_ring_ops_types.h"
> > +#include "xe_sched_job.h"
> > #include "xe_trace.h"
> > #include "xe_vm.h"
> > #include "xe_pxp.h"
> > @@ -1106,11 +1107,17 @@ void xe_exec_queue_last_fence_set(struct
> > xe_exec_queue *q, struct xe_vm *vm,
> > * xe_exec_queue_last_fence_test_dep - Test last fence dependency of
> > queue
> > * @q: The exec queue
> > * @vm: The VM the engine does a bind or exec for
> > + * @mask_ctx0: Mask dma-fence context0
> > + * @mask_ctx1: Mask dma-fence context1
> > + *
> > + * Test last fence dependency of queue, skipping masked dma fence
> > contexts.
> > *
> > * Returns:
> > - * -ETIME if there exists an unsignalled last fence dependency, zero
> > otherwise.
> > + * -ETIME if there exists an unsignalled and unmasked last fence
> > dependency,
> > + * zero otherwise.
> > */
> > -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > struct xe_vm *vm)
> > +int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > struct xe_vm *vm,
> > + u64 mask_ctx0, u64 mask_ctx1)
> > {
> > struct dma_fence *fence;
> > int err = 0;
> > @@ -1119,6 +1126,13 @@ int xe_exec_queue_last_fence_test_dep(struct
> > xe_exec_queue *q, struct xe_vm *vm)
> > if (fence) {
> > err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence-
> > >flags) ?
> > 0 : -ETIME;
> > +
> > + if (err == -ETIME) {
> > + if (xe_sched_job_mask_dependency(fence,
> > mask_ctx0,
> > + mask_ctx1))
> > + err = 0;
> > + }
> > +
> > dma_fence_put(fence);
> > }
> >
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h
> > b/drivers/gpu/drm/xe/xe_exec_queue.h
> > index a4dfbe858bda..99a35b22a46c 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> > @@ -85,7 +85,8 @@ struct dma_fence
> > *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
> > void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct
> > xe_vm *vm,
> > struct dma_fence *fence);
> > int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > - struct xe_vm *vm);
> > + struct xe_vm *vm, u64
> > mask_ctx0,
> > + u64 mask_ctx1);
> > void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
> >
> > int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void
> > *scratch);
> > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > index d22fd1ccc0ba..bba9ae559f57 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.c
> > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > @@ -1341,10 +1341,21 @@ static int xe_pt_vm_dependencies(struct
> > xe_sched_job *job,
> > }
> >
> > if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
> > + u64 mask_ctx0 = NO_MASK_DEP, mask_ctx1 = NO_MASK_DEP;
> > +
> > +		if (ijob)
> > +			mask_ctx0 = xe_tlb_inval_job_fence_context(ijob);
> > +		if (mjob)
> > +			mask_ctx1 = xe_tlb_inval_job_fence_context(mjob);
>
> Can we rename these ictx and mctx for consistency?
>
Yes.
> Also, do we really need both of these here? Shouldn't we always have
> the primary GT inval (ictx) and so just need to check the one? My
As the code is written, we'd only need to check the primary GT, but Matt
R eventually wants the driver to be able to boot without the primary GT.
There is a bit of work to be done there, but I didn't want to make it
worse in this patch.
> understanding is the reason being that we might be adding either one of
> these as the last fence so we need both checks. But in that case would
> it be better to check against all dependencies or even just the last
> two? Wouldn't that also help if multiple apps are trying to free at
> once here so we have interleaved unbind dependencies?
>
Depending on how the bind is set up, we may check further dependencies
in dma-resv - see all the other checks in this function. This covers
only the last-queue dependency, which is at least sufficient to help
the ChromeOS case, where this is triggered by a burst of user unbinds.
We might still have an issue with a burst of SVM unbinds, where this
can serialize; that would likely need some DRM scheduler changes,
fairly similar to this patch. I can think about that one in a follow-up
in a later series. I should also probably switch SVM unbinds over to
draining the entire garbage collector list and issuing a single unbind
job.
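To make the scope concrete, the two call sites end up looking roughly
like this after the patch (condensed from the hunks in the patch - the
ternaries here stand in for the if-statements in the actual diff):

	/* exec path: nothing to mask, both slots pass NO_MASK_DEP */
	err = xe_sched_job_last_fence_add_dep(job, vm, NO_MASK_DEP,
					      NO_MASK_DEP);

	/*
	 * PT-update path: mask the TLB-inval fence contexts so a last
	 * fence that is only a prior invalidation no longer serializes
	 */
	u64 mask_ctx0 = ijob ? xe_tlb_inval_job_fence_context(ijob) : NO_MASK_DEP;
	u64 mask_ctx1 = mjob ? xe_tlb_inval_job_fence_context(mjob) : NO_MASK_DEP;
	err = xe_sched_job_last_fence_add_dep(job, vm, mask_ctx0, mask_ctx1);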
> > +
> >  		if (job)
> > -			err = xe_sched_job_last_fence_add_dep(job, vm);
> > +			err = xe_sched_job_last_fence_add_dep(job, vm,
> > +							      mask_ctx0,
> > +							      mask_ctx1);
> >  		else
> > -			err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> > +			err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q,
> > +								vm, mask_ctx0,
> > +								mask_ctx1);
> > }
> >
> > for (i = 0; job && !err && i < vops->num_syncs; i++)
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job.c
> > b/drivers/gpu/drm/xe/xe_sched_job.c
> > index d21bf8f26964..7cbdd87904c6 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job.c
> > +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> > @@ -6,6 +6,7 @@
> > #include "xe_sched_job.h"
> >
> > #include <uapi/drm/xe_drm.h>
> > +#include <linux/dma-fence-array.h>
> > #include <linux/dma-fence-chain.h>
> > #include <linux/slab.h>
> >
> > @@ -295,19 +296,60 @@ void xe_sched_job_push(struct xe_sched_job
> > *job)
> > xe_sched_job_put(job);
> > }
> >
> > +/**
> > + * xe_sched_job_mask_dependency() - Determine if a dma-fence
> > dependency can be masked
> > + * @fence: The dma-fence to check
> > + * @mask_ctx0: First context to compare against the fence's context
> > + * @mask_ctx1: Second context to compare against the fence's context
> > + *
> > + * This function checks whether the context of the given dma-fence
> > matches
> > + * either of the provided mask contexts. If a match is found, the
> > dependency
> > + * represented by the fence can be skipped. If the fence is a dma-
> > fence-array,
> > + * its individual fences are unwound and checked.
> > + *
> > + * Return: true if the fence can be masked (i.e., skipped), false
> > otherwise.
> > + */
> > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64
> > mask_ctx0,
> > + u64 mask_ctx1)
> > +{
> > + if (dma_fence_is_array(fence)) {
> > + struct dma_fence *__fence;
> > + int index;
> > +
> > + dma_fence_array_for_each(__fence, index, fence)
> > + if (__fence->context == mask_ctx0 ||
> > + __fence->context == mask_ctx1)
> > + return true;
> > + } else if (fence->context == mask_ctx0 ||
> > + fence->context == mask_ctx1) {
> > + return true;
> > + }
> > +
> > + return false;
> > +}
> > +
> > /**
> > * xe_sched_job_last_fence_add_dep - Add last fence dependency to
> > job
> > * @job:job to add the last fence dependency to
> > * @vm: virtual memory job belongs to
> > + * @mask_ctx0: Mask dma-fence context0
> > + * @mask_ctx1: Mask dma-fence context1
> > + *
> > + * Add last fence dependency to job, skipping masked dma fence
> > contexts.
> > *
> > * Returns:
> > * 0 on success, or an error on failing to expand the array.
> > */
> > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct
> > xe_vm *vm)
> > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct
> > xe_vm *vm,
> > + u64 mask_ctx0, u64 mask_ctx1)
> > {
> > struct dma_fence *fence;
> >
> > fence = xe_exec_queue_last_fence_get(job->q, vm);
> > + if (xe_sched_job_mask_dependency(fence, mask_ctx0,
> > mask_ctx1)) {
> > + dma_fence_put(fence);
> > + return 0;
> > + }
> >
> > return drm_sched_job_add_dependency(&job->drm, fence);
> > }
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job.h
> > b/drivers/gpu/drm/xe/xe_sched_job.h
> > index 3dc72c5c1f13..81d8e848e605 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job.h
> > +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> > @@ -58,7 +58,8 @@ bool xe_sched_job_completed(struct xe_sched_job
> > *job);
> > void xe_sched_job_arm(struct xe_sched_job *job);
> > void xe_sched_job_push(struct xe_sched_job *job);
> >
> > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct
> > xe_vm *vm);
> > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct
> > xe_vm *vm,
> > + u64 mask_ctx0, u64 mask_ctx1);
> > void xe_sched_job_init_user_fence(struct xe_sched_job *job,
> > struct xe_sync_entry *sync);
> >
> > @@ -93,4 +94,8 @@ void xe_sched_job_snapshot_print(struct
> > xe_sched_job_snapshot *snapshot, struct
> > int xe_sched_job_add_deps(struct xe_sched_job *job, struct dma_resv
> > *resv,
> > enum dma_resv_usage usage);
> >
> > +#define NO_MASK_DEP (~0x0ull)
> > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64
> > mask_ctx0,
> > + u64 mask_ctx1);
> > +
> > #endif
> > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > index 492def04a559..f2fe7f9fbb22 100644
> > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > @@ -32,6 +32,8 @@ struct xe_tlb_inval_job {
> > u64 start;
> > /** @end: End address to invalidate */
> > u64 end;
> > + /** @fence_context: Fence context for job */
> > + u64 fence_context;
> > /** @asid: Address space ID to invalidate */
> > u32 asid;
> > /** @fence_armed: Fence has been armed */
> > @@ -101,6 +103,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q,
> > struct xe_tlb_inval *tlb_inval,
> > job->asid = asid;
> > job->fence_armed = false;
> > job->dep.ops = &dep_job_ops;
>
> This means the "finished" context per the entity definition right? Can
> you either add a note here or change the job->fence_context name to
> reflect that? Or otherwise why is this adding the +1 here?
>
The scheduled fence context is entity->fence_context, and the finished
fence context is entity->fence_context + 1 - this is in the DRM
scheduler doc. I can add a comment to this for now, and roll a better
fix - a DRM scheduler helper to fish out the finished context [1] -
into this series. DRM scheduler stuff moves slowly, so the latter may
take a minute, and I didn't want to block a fix on that.
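For reference, a minimal sketch of that convention, paraphrased from
drm_sched_entity_init() / drm_sched_fence_init() (not the exact
upstream code):

	/* each entity reserves two dma-fence contexts up front */
	entity->fence_context = dma_fence_context_alloc(2);

	/* the scheduled fence uses the first context... */
	dma_fence_init(&fence->scheduled, &drm_sched_fence_ops_scheduled,
		       &fence->lock, entity->fence_context, seq);

	/*
	 * ...and the finished fence uses fence_context + 1, which is the
	 * context job->fence_context stores in this patch
	 */
	dma_fence_init(&fence->finished, &drm_sched_fence_ops_finished,
		       &fence->lock, entity->fence_context + 1, seq);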
Matt
[1] https://patchwork.freedesktop.org/series/155314/
> Thanks,
> Stuart
>
> > + job->fence_context = entity->fence_context + 1;
> > kref_init(&job->refcount);
> > xe_exec_queue_get(q); /* Pairs with put in
> > xe_tlb_inval_job_destroy */
> >
> > @@ -266,3 +269,14 @@ void xe_tlb_inval_job_put(struct
> > xe_tlb_inval_job *job)
> > if (!IS_ERR_OR_NULL(job))
> > kref_put(&job->refcount, xe_tlb_inval_job_destroy);
> > }
> > +
> > +/**
> > + * xe_tlb_inval_job_fence_context() - TLB invalidation job fence
> > context
> > + * @job: TLB invalidation job object
> > + *
> > + * Return: TLB invalidation job fence context
> > + */
> > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job)
> > +{
> > + return job->fence_context;
> > +}
> > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > index e63edcb26b50..2576165c2228 100644
> > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > @@ -30,4 +30,6 @@ void xe_tlb_inval_job_get(struct xe_tlb_inval_job
> > *job);
> >
> > void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job);
> >
> > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job);
> > +
> > #endif
>
* Re: [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations
2025-10-21 20:36 ` Matthew Brost
@ 2025-10-21 20:43 ` Summers, Stuart
2025-10-21 20:50 ` Matthew Brost
From: Summers, Stuart @ 2025-10-21 20:43 UTC (permalink / raw)
To: Brost, Matthew
Cc: intel-xe@lists.freedesktop.org, Santa, Carlos,
thomas.hellstrom@linux.intel.com
On Tue, 2025-10-21 at 13:36 -0700, Matthew Brost wrote:
> On Tue, Oct 21, 2025 at 11:55:56AM -0600, Summers, Stuart wrote:
> > On Fri, 2025-10-17 at 09:52 -0700, Matthew Brost wrote:
> > > When a burst of unbind jobs is issued, a dependency chain can form
> > > between the TLB invalidation of a previous unbind job and the
> > > current one. This leads to undesirable serialization, causing
> > > current jobs to wait unnecessarily for prior TLB invalidations,
> > > execute on the GPU when not needed, and significantly slow down the
> > > unbind burst—resulting in up to a 4× slowdown.
> > >
> > > To break this chain, mask the last bind queue dependency if the
> > > last
> > > fence's DMA context matches the TLB invalidation context. This
> > > allows
> > > full pipelining of unbinds and TLB invalidations while preserving
> > > correct dma-fence signaling semantics.
> > >
> > > Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> > > drivers/gpu/drm/xe/xe_exec.c | 3 +-
> > > drivers/gpu/drm/xe/xe_exec_queue.c | 18 +++++++++--
> > > drivers/gpu/drm/xe/xe_exec_queue.h | 3 +-
> > > drivers/gpu/drm/xe/xe_pt.c | 15 +++++++--
> > > drivers/gpu/drm/xe/xe_sched_job.c | 44 ++++++++++++++++++++++++++-
> > > drivers/gpu/drm/xe/xe_sched_job.h | 7 ++++-
> > > drivers/gpu/drm/xe/xe_tlb_inval_job.c | 14 +++++++++
> > > drivers/gpu/drm/xe/xe_tlb_inval_job.h | 2 ++
> > > 8 files changed, 98 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_exec.c
> > > b/drivers/gpu/drm/xe/xe_exec.c
> > > index 0dc27476832b..6034cfc8be06 100644
> > > --- a/drivers/gpu/drm/xe/xe_exec.c
> > > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > > @@ -294,7 +294,8 @@ int xe_exec_ioctl(struct drm_device *dev,
> > > void
> > > *data, struct drm_file *file)
> > > goto err_put_job;
> > >
> > > if (!xe_vm_in_lr_mode(vm)) {
> > > - err = xe_sched_job_last_fence_add_dep(job, vm);
> > > + err = xe_sched_job_last_fence_add_dep(job, vm,
> > > NO_MASK_DEP,
> > > +
> > > NO_MASK_DEP);
> > > if (err)
> > > goto err_put_job;
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > index 90cbc95f8e2e..d6f69d9bccba 100644
> > > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > @@ -25,6 +25,7 @@
> > > #include "xe_migrate.h"
> > > #include "xe_pm.h"
> > > #include "xe_ring_ops_types.h"
> > > +#include "xe_sched_job.h"
> > > #include "xe_trace.h"
> > > #include "xe_vm.h"
> > > #include "xe_pxp.h"
> > > @@ -1106,11 +1107,17 @@ void xe_exec_queue_last_fence_set(struct
> > > xe_exec_queue *q, struct xe_vm *vm,
> > > * xe_exec_queue_last_fence_test_dep - Test last fence
> > > dependency of
> > > queue
> > > * @q: The exec queue
> > > * @vm: The VM the engine does a bind or exec for
> > > + * @mask_ctx0: Mask dma-fence context0
> > > + * @mask_ctx1: Mask dma-fence context1
> > > + *
> > > + * Test last fence dependency of queue, skipping masked dma
> > > fence
> > > contexts.
> > > *
> > > * Returns:
> > > - * -ETIME if there exists an unsignalled last fence dependency,
> > > zero
> > > otherwise.
> > > + * -ETIME if there exists an unsignalled and unmasked last fence
> > > dependency,
> > > + * zero otherwise.
> > > */
> > > -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > > struct xe_vm *vm)
> > > +int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > > struct xe_vm *vm,
> > > + u64 mask_ctx0, u64
> > > mask_ctx1)
> > > {
> > > struct dma_fence *fence;
> > > int err = 0;
> > > @@ -1119,6 +1126,13 @@ int
> > > xe_exec_queue_last_fence_test_dep(struct
> > > xe_exec_queue *q, struct xe_vm *vm)
> > > if (fence) {
> > > err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> > > &fence-
> > > > flags) ?
> > > 0 : -ETIME;
> > > +
> > > + if (err == -ETIME) {
> > > + if (xe_sched_job_mask_dependency(fence,
> > > mask_ctx0,
> > > +
> > > mask_ctx1))
> > > + err = 0;
> > > + }
> > > +
> > > dma_fence_put(fence);
> > > }
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h
> > > b/drivers/gpu/drm/xe/xe_exec_queue.h
> > > index a4dfbe858bda..99a35b22a46c 100644
> > > --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> > > @@ -85,7 +85,8 @@ struct dma_fence
> > > *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
> > > void xe_exec_queue_last_fence_set(struct xe_exec_queue *e,
> > > struct
> > > xe_vm *vm,
> > > struct dma_fence *fence);
> > > int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > > - struct xe_vm *vm);
> > > + struct xe_vm *vm, u64
> > > mask_ctx0,
> > > + u64 mask_ctx1);
> > > void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
> > >
> > > int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q,
> > > void
> > > *scratch);
> > > diff --git a/drivers/gpu/drm/xe/xe_pt.c
> > > b/drivers/gpu/drm/xe/xe_pt.c
> > > index d22fd1ccc0ba..bba9ae559f57 100644
> > > --- a/drivers/gpu/drm/xe/xe_pt.c
> > > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > > @@ -1341,10 +1341,21 @@ static int xe_pt_vm_dependencies(struct
> > > xe_sched_job *job,
> > > }
> > >
> > >  	if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
> > > +		u64 mask_ctx0 = NO_MASK_DEP, mask_ctx1 = NO_MASK_DEP;
> > > +
> > > +		if (ijob)
> > > +			mask_ctx0 = xe_tlb_inval_job_fence_context(ijob);
> > > +		if (mjob)
> > > +			mask_ctx1 = xe_tlb_inval_job_fence_context(mjob);
> >
> > Can we rename these ictx and mctx for consistency?
> >
>
> Yes.
>
> > Also, do we really need both of these here? Shouldn't we always
> > have
> > the primary GT inval (ictx) and so just need to check the one? My
>
> As the code is written, we'd only need to check the primary GT, but
> Matt R eventually wants the driver to be able to boot without the
> primary GT. There is a bit of work to be done there, but I didn't want
> to make it worse in this patch.
Yeah, makes sense - I was just thinking of how to make this a little
simpler. I don't really like that we're looking at both contexts here
when we really just need one of them. But we're also doing this in
other parts of the driver (like the primary/media GT TLB invals on
which this is based), so maybe it's no problem. It would just be good
to at least note this, since otherwise it isn't super clear at a glance
why we need two contexts here.
I think at least having those name changes (ictx and mctx) would help
here.
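E.g. something like this against the hunk above (purely illustrative,
just showing the suggested names):

	u64 ictx = ijob ? xe_tlb_inval_job_fence_context(ijob) : NO_MASK_DEP;
	u64 mctx = mjob ? xe_tlb_inval_job_fence_context(mjob) : NO_MASK_DEP;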
>
> > understanding is the reason being that we might be adding either
> > one of
> > these as the last fence so we need both checks. But in that case
> > would
> > it be better to check against all dependencies or even just the
> > last
> > two? Wouldn't that also help if multiple apps are trying to free at
> > once here so we have interleaved unbind dependencies?
> >
>
> Depending on how the bind is set up, we may check further dependencies
> in dma-resv - see all the other checks in this function. This covers
> only the last-queue dependency, which is at least sufficient to help
> the ChromeOS case, where this is triggered by a burst of user unbinds.
>
> We might still have an issue with a burst of SVM unbinds, where this
> can serialize; that would likely need some DRM scheduler changes,
> fairly similar to this patch. I can think about that one in a
> follow-up in a later series. I should also probably switch SVM unbinds
> over to draining the entire garbage collector list and issuing a
> single unbind job.
Makes sense to me, but I'm also fine having that in a follow-up patch.
>
> > > +
> > > if (job)
> > > - err =
> > > xe_sched_job_last_fence_add_dep(job,
> > > vm);
> > > + err =
> > > xe_sched_job_last_fence_add_dep(job,
> > > vm,
> > > +
> > > mask_ctx0,
> > > +
> > > mask_ctx1);
> > > else
> > > - err =
> > > xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> > > + err =
> > > xe_exec_queue_last_fence_test_dep(pt_update_ops->q,
> > > + v
> > > m,
> > > mask_ctx0,
> > > + m
> > > ask_
> > > ctx1);
> > > }
> > >
> > > for (i = 0; job && !err && i < vops->num_syncs; i++)
> > > diff --git a/drivers/gpu/drm/xe/xe_sched_job.c
> > > b/drivers/gpu/drm/xe/xe_sched_job.c
> > > index d21bf8f26964..7cbdd87904c6 100644
> > > --- a/drivers/gpu/drm/xe/xe_sched_job.c
> > > +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> > > @@ -6,6 +6,7 @@
> > > #include "xe_sched_job.h"
> > >
> > > #include <uapi/drm/xe_drm.h>
> > > +#include <linux/dma-fence-array.h>
> > > #include <linux/dma-fence-chain.h>
> > > #include <linux/slab.h>
> > >
> > > @@ -295,19 +296,60 @@ void xe_sched_job_push(struct xe_sched_job
> > > *job)
> > > xe_sched_job_put(job);
> > > }
> > >
> > > +/**
> > > + * xe_sched_job_mask_dependency() - Determine if a dma-fence
> > > dependency can be masked
> > > + * @fence: The dma-fence to check
> > > + * @mask_ctx0: First context to compare against the fence's
> > > context
> > > + * @mask_ctx1: Second context to compare against the fence's
> > > context
> > > + *
> > > + * This function checks whether the context of the given dma-
> > > fence
> > > matches
> > > + * either of the provided mask contexts. If a match is found,
> > > the
> > > dependency
> > > + * represented by the fence can be skipped. If the fence is a
> > > dma-
> > > fence-array,
> > > + * its individual fences are unwound and checked.
> > > + *
> > > + * Return: true if the fence can be masked (i.e., skipped),
> > > false
> > > otherwise.
> > > + */
> > > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64
> > > mask_ctx0,
> > > + u64 mask_ctx1)
> > > +{
> > > + if (dma_fence_is_array(fence)) {
> > > + struct dma_fence *__fence;
> > > + int index;
> > > +
> > > + dma_fence_array_for_each(__fence, index, fence)
> > > + if (__fence->context == mask_ctx0 ||
> > > + __fence->context == mask_ctx1)
> > > + return true;
> > > + } else if (fence->context == mask_ctx0 ||
> > > + fence->context == mask_ctx1) {
> > > + return true;
> > > + }
> > > +
> > > + return false;
> > > +}
> > > +
> > > /**
> > > * xe_sched_job_last_fence_add_dep - Add last fence dependency
> > > to
> > > job
> > > * @job:job to add the last fence dependency to
> > > * @vm: virtual memory job belongs to
> > > + * @mask_ctx0: Mask dma-fence context0
> > > + * @mask_ctx1: Mask dma-fence context1
> > > + *
> > > + * Add last fence dependency to job, skipping masked dma fence
> > > contexts.
> > > *
> > > * Returns:
> > > * 0 on success, or an error on failing to expand the array.
> > > */
> > > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job,
> > > struct
> > > xe_vm *vm)
> > > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job,
> > > struct
> > > xe_vm *vm,
> > > + u64 mask_ctx0, u64 mask_ctx1)
> > > {
> > > struct dma_fence *fence;
> > >
> > > fence = xe_exec_queue_last_fence_get(job->q, vm);
> > > + if (xe_sched_job_mask_dependency(fence, mask_ctx0,
> > > mask_ctx1)) {
> > > + dma_fence_put(fence);
> > > + return 0;
> > > + }
> > >
> > > return drm_sched_job_add_dependency(&job->drm, fence);
> > > }
> > > diff --git a/drivers/gpu/drm/xe/xe_sched_job.h
> > > b/drivers/gpu/drm/xe/xe_sched_job.h
> > > index 3dc72c5c1f13..81d8e848e605 100644
> > > --- a/drivers/gpu/drm/xe/xe_sched_job.h
> > > +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> > > @@ -58,7 +58,8 @@ bool xe_sched_job_completed(struct xe_sched_job
> > > *job);
> > > void xe_sched_job_arm(struct xe_sched_job *job);
> > > void xe_sched_job_push(struct xe_sched_job *job);
> > >
> > > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job,
> > > struct
> > > xe_vm *vm);
> > > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job,
> > > struct
> > > xe_vm *vm,
> > > + u64 mask_ctx0, u64
> > > mask_ctx1);
> > > void xe_sched_job_init_user_fence(struct xe_sched_job *job,
> > > struct xe_sync_entry *sync);
> > >
> > > @@ -93,4 +94,8 @@ void xe_sched_job_snapshot_print(struct
> > > xe_sched_job_snapshot *snapshot, struct
> > > int xe_sched_job_add_deps(struct xe_sched_job *job, struct
> > > dma_resv
> > > *resv,
> > > enum dma_resv_usage usage);
> > >
> > > +#define NO_MASK_DEP (~0x0ull)
> > > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64
> > > mask_ctx0,
> > > + u64 mask_ctx1);
> > > +
> > > #endif
> > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > index 492def04a559..f2fe7f9fbb22 100644
> > > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > @@ -32,6 +32,8 @@ struct xe_tlb_inval_job {
> > > u64 start;
> > > /** @end: End address to invalidate */
> > > u64 end;
> > > + /** @fence_context: Fence context for job */
> > > + u64 fence_context;
> > > /** @asid: Address space ID to invalidate */
> > > u32 asid;
> > > /** @fence_armed: Fence has been armed */
> > > @@ -101,6 +103,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue
> > > *q,
> > > struct xe_tlb_inval *tlb_inval,
> > > job->asid = asid;
> > > job->fence_armed = false;
> > > job->dep.ops = &dep_job_ops;
> >
> > This means the "finished" context per the entity definition, right?
> > Can you either add a note here or change the job->fence_context name
> > to reflect that? Or otherwise, why is this adding the +1 here?
> >
>
> The scheduled fence context is entity->fence_context, and the finished
> fence context is entity->fence_context + 1 - this is in the DRM
> scheduler doc. I can add a comment to this for now, and roll a better
> fix - a DRM scheduler helper to fish out the finished context [1] -
> into this series. DRM scheduler stuff moves slowly, so the latter may
> take a minute, and I didn't want to block a fix on that.
Can you add some quick documentation there to that effect? It's just
nice not to have to go back and forth to the entity documentation,
which right now is just implied.
Also thanks for the link to that other series, I'll check that out too.
Thanks,
Stuart
>
> Matt
>
> [1] https://patchwork.freedesktop.org/series/155314/
>
> > Thanks,
> > Stuart
> >
> > > + job->fence_context = entity->fence_context + 1;
> > > kref_init(&job->refcount);
> > > xe_exec_queue_get(q); /* Pairs with put in
> > > xe_tlb_inval_job_destroy */
> > >
> > > @@ -266,3 +269,14 @@ void xe_tlb_inval_job_put(struct
> > > xe_tlb_inval_job *job)
> > > if (!IS_ERR_OR_NULL(job))
> > > kref_put(&job->refcount,
> > > xe_tlb_inval_job_destroy);
> > > }
> > > +
> > > +/**
> > > + * xe_tlb_inval_job_fence_context() - TLB invalidation job fence
> > > context
> > > + * @job: TLB invalidation job object
> > > + *
> > > + * Return: TLB invalidation job fence context
> > > + */
> > > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job)
> > > +{
> > > + return job->fence_context;
> > > +}
> > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > index e63edcb26b50..2576165c2228 100644
> > > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > @@ -30,4 +30,6 @@ void xe_tlb_inval_job_get(struct
> > > xe_tlb_inval_job
> > > *job);
> > >
> > > void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job);
> > >
> > > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job
> > > *job);
> > > +
> > > #endif
> >
* Re: [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations
2025-10-21 20:43 ` Summers, Stuart
@ 2025-10-21 20:50 ` Matthew Brost
From: Matthew Brost @ 2025-10-21 20:50 UTC (permalink / raw)
To: Summers, Stuart
Cc: intel-xe@lists.freedesktop.org, Santa, Carlos,
thomas.hellstrom@linux.intel.com
On Tue, Oct 21, 2025 at 02:43:07PM -0600, Summers, Stuart wrote:
> On Tue, 2025-10-21 at 13:36 -0700, Matthew Brost wrote:
> > On Tue, Oct 21, 2025 at 11:55:56AM -0600, Summers, Stuart wrote:
> > > On Fri, 2025-10-17 at 09:52 -0700, Matthew Brost wrote:
> > > > When a burst of unbind jobs is issued, a dependency chain can
> > > > form between the TLB invalidation of a previous unbind job and
> > > > the current one. This leads to undesirable serialization, causing
> > > > current jobs to wait unnecessarily for prior TLB invalidations,
> > > > execute on the GPU when not needed, and significantly slow down
> > > > the unbind burst—resulting in up to a 4× slowdown.
> > > >
> > > > To break this chain, mask the last bind queue dependency if the
> > > > last
> > > > fence's DMA context matches the TLB invalidation context. This
> > > > allows
> > > > full pipelining of unbinds and TLB invalidations while preserving
> > > > correct dma-fence signaling semantics.
> > > >
> > > > Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > > drivers/gpu/drm/xe/xe_exec.c | 3 +-
> > > > drivers/gpu/drm/xe/xe_exec_queue.c | 18 +++++++++--
> > > > drivers/gpu/drm/xe/xe_exec_queue.h | 3 +-
> > > > drivers/gpu/drm/xe/xe_pt.c | 15 +++++++--
> > > > drivers/gpu/drm/xe/xe_sched_job.c | 44 ++++++++++++++++++++++++++-
> > > > drivers/gpu/drm/xe/xe_sched_job.h | 7 ++++-
> > > > drivers/gpu/drm/xe/xe_tlb_inval_job.c | 14 +++++++++
> > > > drivers/gpu/drm/xe/xe_tlb_inval_job.h | 2 ++
> > > > 8 files changed, 98 insertions(+), 8 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec.c
> > > > b/drivers/gpu/drm/xe/xe_exec.c
> > > > index 0dc27476832b..6034cfc8be06 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec.c
> > > > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > > > @@ -294,7 +294,8 @@ int xe_exec_ioctl(struct drm_device *dev,
> > > > void
> > > > *data, struct drm_file *file)
> > > > goto err_put_job;
> > > >
> > > > if (!xe_vm_in_lr_mode(vm)) {
> > > > - err = xe_sched_job_last_fence_add_dep(job, vm);
> > > > + err = xe_sched_job_last_fence_add_dep(job, vm,
> > > > NO_MASK_DEP,
> > > > +
> > > > NO_MASK_DEP);
> > > > if (err)
> > > > goto err_put_job;
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > index 90cbc95f8e2e..d6f69d9bccba 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > @@ -25,6 +25,7 @@
> > > > #include "xe_migrate.h"
> > > > #include "xe_pm.h"
> > > > #include "xe_ring_ops_types.h"
> > > > +#include "xe_sched_job.h"
> > > > #include "xe_trace.h"
> > > > #include "xe_vm.h"
> > > > #include "xe_pxp.h"
> > > > @@ -1106,11 +1107,17 @@ void xe_exec_queue_last_fence_set(struct
> > > > xe_exec_queue *q, struct xe_vm *vm,
> > > > * xe_exec_queue_last_fence_test_dep - Test last fence
> > > > dependency of
> > > > queue
> > > > * @q: The exec queue
> > > > * @vm: The VM the engine does a bind or exec for
> > > > + * @mask_ctx0: Mask dma-fence context0
> > > > + * @mask_ctx1: Mask dma-fence context1
> > > > + *
> > > > + * Test last fence dependency of queue, skipping masked dma
> > > > fence
> > > > contexts.
> > > > *
> > > > * Returns:
> > > > - * -ETIME if there exists an unsignalled last fence dependency,
> > > > zero
> > > > otherwise.
> > > > + * -ETIME if there exists an unsignalled and unmasked last fence
> > > > dependency,
> > > > + * zero otherwise.
> > > > */
> > > > -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > > > struct xe_vm *vm)
> > > > +int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > > > struct xe_vm *vm,
> > > > + u64 mask_ctx0, u64
> > > > mask_ctx1)
> > > > {
> > > > struct dma_fence *fence;
> > > > int err = 0;
> > > > @@ -1119,6 +1126,13 @@ int
> > > > xe_exec_queue_last_fence_test_dep(struct
> > > > xe_exec_queue *q, struct xe_vm *vm)
> > > > if (fence) {
> > > > err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> > > > &fence-
> > > > > flags) ?
> > > > 0 : -ETIME;
> > > > +
> > > > + if (err == -ETIME) {
> > > > + if (xe_sched_job_mask_dependency(fence,
> > > > mask_ctx0,
> > > > +
> > > > mask_ctx1))
> > > > + err = 0;
> > > > + }
> > > > +
> > > > dma_fence_put(fence);
> > > > }
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h
> > > > b/drivers/gpu/drm/xe/xe_exec_queue.h
> > > > index a4dfbe858bda..99a35b22a46c 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> > > > @@ -85,7 +85,8 @@ struct dma_fence
> > > > *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
> > > > void xe_exec_queue_last_fence_set(struct xe_exec_queue *e,
> > > > struct
> > > > xe_vm *vm,
> > > > struct dma_fence *fence);
> > > > int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > > > - struct xe_vm *vm);
> > > > + struct xe_vm *vm, u64
> > > > mask_ctx0,
> > > > + u64 mask_ctx1);
> > > > void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
> > > >
> > > > int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q,
> > > > void
> > > > *scratch);
> > > > diff --git a/drivers/gpu/drm/xe/xe_pt.c
> > > > b/drivers/gpu/drm/xe/xe_pt.c
> > > > index d22fd1ccc0ba..bba9ae559f57 100644
> > > > --- a/drivers/gpu/drm/xe/xe_pt.c
> > > > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > > > @@ -1341,10 +1341,21 @@ static int xe_pt_vm_dependencies(struct
> > > > xe_sched_job *job,
> > > > }
> > > >
> > > > if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL))
> > > > {
> > > > + u64 mask_ctx0 = NO_MASK_DEP, mask_ctx1 =
> > > > NO_MASK_DEP;
> > > > +
> > > > + if (ijob)
> > > > + mask_ctx0 =
> > > > xe_tlb_inval_job_fence_context(ijob);
> > > > + if (mjob)
> > > > + mask_ctx1 =
> > > > xe_tlb_inval_job_fence_context(mjob);
> > >
> > > Can we rename these ictx and mctx for consistency?
> > >
> >
> > Yes.
> >
> > > Also, do we really need both of these here? Shouldn't we always
> > > have
> > > the primary GT inval (ictx) and so just need to check the one? My
> >
> > As the code is written, we'd only need to check the primary GT, but
> > Matt R eventually wants the driver to be able to boot without the
> > primary GT. There is a bit of work to be done there, but I didn't
> > want to make it worse in this patch.
>
> Yeah, makes sense - I was just thinking of how to make this a little
> simpler. I don't really like that we're looking at both contexts here
> when we really just need one of them. But we're also doing this in
> other parts of the driver (like the primary/media GT TLB invals on
> which this is based), so maybe it's no problem. It would just be good
> to at least note this, since otherwise it isn't super clear at a
> glance why we need two contexts here.
>
> I think at least having those name changes (ictx and mctx) would help
> here.
>
Yes, will add a comment too.
> >
> > > understanding is the reason being that we might be adding either
> > > one of
> > > these as the last fence so we need both checks. But in that case
> > > would
> > > it be better to check against all dependencies or even just the
> > > last
> > > two? Wouldn't that also help if multiple apps are trying to free at
> > > once here so we have interleaved unbind dependencies?
> > >
> >
> > Depending on how the bind is set up, we may check further
> > dependencies in dma-resv - see all the other checks in this function.
> > This covers only the last-queue dependency, which is at least
> > sufficient to help the ChromeOS case, where this is triggered by a
> > burst of user unbinds.
> >
> > We might still have an issue with a burst of SVM unbinds, where this
> > can serialize; that would likely need some DRM scheduler changes,
> > fairly similar to this patch. I can think about that one in a
> > follow-up in a later series. I should also probably switch SVM
> > unbinds over to draining the entire garbage collector list and
> > issuing a single unbind job.
>
> Makes sense to me, but I'm also fine having that in a follow-up patch.
>
> >
> > > > +
> > > > if (job)
> > > > - err =
> > > > xe_sched_job_last_fence_add_dep(job,
> > > > vm);
> > > > + err =
> > > > xe_sched_job_last_fence_add_dep(job,
> > > > vm,
> > > > +
> > > > mask_ctx0,
> > > > +
> > > > mask_ctx1);
> > > > else
> > > > - err =
> > > > xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> > > > + err =
> > > > xe_exec_queue_last_fence_test_dep(pt_update_ops->q,
> > > > + v
> > > > m,
> > > > mask_ctx0,
> > > > + m
> > > > ask_
> > > > ctx1);
> > > > }
> > > >
> > > > for (i = 0; job && !err && i < vops->num_syncs; i++)
> > > > diff --git a/drivers/gpu/drm/xe/xe_sched_job.c
> > > > b/drivers/gpu/drm/xe/xe_sched_job.c
> > > > index d21bf8f26964..7cbdd87904c6 100644
> > > > --- a/drivers/gpu/drm/xe/xe_sched_job.c
> > > > +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> > > > @@ -6,6 +6,7 @@
> > > > #include "xe_sched_job.h"
> > > >
> > > > #include <uapi/drm/xe_drm.h>
> > > > +#include <linux/dma-fence-array.h>
> > > > #include <linux/dma-fence-chain.h>
> > > > #include <linux/slab.h>
> > > >
> > > > @@ -295,19 +296,60 @@ void xe_sched_job_push(struct xe_sched_job
> > > > *job)
> > > > xe_sched_job_put(job);
> > > > }
> > > >
> > > > +/**
> > > > + * xe_sched_job_mask_dependency() - Determine if a dma-fence
> > > > dependency can be masked
> > > > + * @fence: The dma-fence to check
> > > > + * @mask_ctx0: First context to compare against the fence's
> > > > context
> > > > + * @mask_ctx1: Second context to compare against the fence's
> > > > context
> > > > + *
> > > > + * This function checks whether the context of the given dma-
> > > > fence
> > > > matches
> > > > + * either of the provided mask contexts. If a match is found,
> > > > the
> > > > dependency
> > > > + * represented by the fence can be skipped. If the fence is a
> > > > dma-
> > > > fence-array,
> > > > + * its individual fences are unwound and checked.
> > > > + *
> > > > + * Return: true if the fence can be masked (i.e., skipped),
> > > > false
> > > > otherwise.
> > > > + */
> > > > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64
> > > > mask_ctx0,
> > > > + u64 mask_ctx1)
> > > > +{
> > > > + if (dma_fence_is_array(fence)) {
> > > > + struct dma_fence *__fence;
> > > > + int index;
> > > > +
> > > > + dma_fence_array_for_each(__fence, index, fence)
> > > > + if (__fence->context == mask_ctx0 ||
> > > > + __fence->context == mask_ctx1)
> > > > + return true;
> > > > + } else if (fence->context == mask_ctx0 ||
> > > > + fence->context == mask_ctx1) {
> > > > + return true;
> > > > + }
> > > > +
> > > > + return false;
> > > > +}
> > > > +
> > > > /**
> > > > * xe_sched_job_last_fence_add_dep - Add last fence dependency
> > > > to
> > > > job
> > > > * @job:job to add the last fence dependency to
> > > > * @vm: virtual memory job belongs to
> > > > + * @mask_ctx0: Mask dma-fence context0
> > > > + * @mask_ctx1: Mask dma-fence context1
> > > > + *
> > > > + * Add last fence dependency to job, skipping masked dma fence
> > > > contexts.
> > > > *
> > > > * Returns:
> > > > * 0 on success, or an error on failing to expand the array.
> > > > */
> > > > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job,
> > > > struct
> > > > xe_vm *vm)
> > > > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job,
> > > > struct
> > > > xe_vm *vm,
> > > > + u64 mask_ctx0, u64 mask_ctx1)
> > > > {
> > > > struct dma_fence *fence;
> > > >
> > > > fence = xe_exec_queue_last_fence_get(job->q, vm);
> > > > + if (xe_sched_job_mask_dependency(fence, mask_ctx0,
> > > > mask_ctx1)) {
> > > > + dma_fence_put(fence);
> > > > + return 0;
> > > > + }
> > > >
> > > > return drm_sched_job_add_dependency(&job->drm, fence);
> > > > }
> > > > diff --git a/drivers/gpu/drm/xe/xe_sched_job.h
> > > > b/drivers/gpu/drm/xe/xe_sched_job.h
> > > > index 3dc72c5c1f13..81d8e848e605 100644
> > > > --- a/drivers/gpu/drm/xe/xe_sched_job.h
> > > > +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> > > > @@ -58,7 +58,8 @@ bool xe_sched_job_completed(struct xe_sched_job
> > > > *job);
> > > > void xe_sched_job_arm(struct xe_sched_job *job);
> > > > void xe_sched_job_push(struct xe_sched_job *job);
> > > >
> > > > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job,
> > > > struct
> > > > xe_vm *vm);
> > > > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job,
> > > > struct
> > > > xe_vm *vm,
> > > > + u64 mask_ctx0, u64
> > > > mask_ctx1);
> > > > void xe_sched_job_init_user_fence(struct xe_sched_job *job,
> > > > struct xe_sync_entry *sync);
> > > >
> > > > @@ -93,4 +94,8 @@ void xe_sched_job_snapshot_print(struct
> > > > xe_sched_job_snapshot *snapshot, struct
> > > > int xe_sched_job_add_deps(struct xe_sched_job *job, struct
> > > > dma_resv
> > > > *resv,
> > > > enum dma_resv_usage usage);
> > > >
> > > > +#define NO_MASK_DEP (~0x0ull)
> > > > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64
> > > > mask_ctx0,
> > > > + u64 mask_ctx1);
> > > > +
> > > > #endif
> > > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > > b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > > index 492def04a559..f2fe7f9fbb22 100644
> > > > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > > @@ -32,6 +32,8 @@ struct xe_tlb_inval_job {
> > > > u64 start;
> > > > /** @end: End address to invalidate */
> > > > u64 end;
> > > > + /** @fence_context: Fence context for job */
> > > > + u64 fence_context;
> > > > /** @asid: Address space ID to invalidate */
> > > > u32 asid;
> > > > /** @fence_armed: Fence has been armed */
> > > > @@ -101,6 +103,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue
> > > > *q,
> > > > struct xe_tlb_inval *tlb_inval,
> > > > job->asid = asid;
> > > > job->fence_armed = false;
> > > > job->dep.ops = &dep_job_ops;
> > >
> > > This means the "finished" context per the entity definition right?
> > > Can
> > > you either add a note here or change the job->fence_context name to
> > > reflect that? Or otherwise why is this adding the +1 here?
> > >
> >
> > The scheduled fence context is entity->fence_context, and the
> > finished fence context is entity->fence_context + 1 - this is in the
> > DRM scheduler doc. I can add a comment to this for now, and roll a
> > better fix - a DRM scheduler helper to fish out the finished context
> > [1] - into this series. DRM scheduler stuff moves slowly, so the
> > latter may take a minute, and I didn't want to block a fix on that.
>
> Can you add some quick documentation there to that effect? It's just
> nice not to have to go back and forth to the entity documentation,
> which right now is just implied.
>
Will add a comment.
Matt
> Also thanks for the link to that other series, I'll check that out too.
>
> Thanks,
> Stuart
>
> >
> > Matt
> >
> > [1] https://patchwork.freedesktop.org/series/155314/
> >
> > > Thanks,
> > > Stuart
> > >
> > > > + job->fence_context = entity->fence_context + 1;
> > > > kref_init(&job->refcount);
> > > > xe_exec_queue_get(q); /* Pairs with put in
> > > > xe_tlb_inval_job_destroy */
> > > >
> > > > @@ -266,3 +269,14 @@ void xe_tlb_inval_job_put(struct
> > > > xe_tlb_inval_job *job)
> > > > if (!IS_ERR_OR_NULL(job))
> > > > kref_put(&job->refcount,
> > > > xe_tlb_inval_job_destroy);
> > > > }
> > > > +
> > > > +/**
> > > > + * xe_tlb_inval_job_fence_context() - TLB invalidation job fence
> > > > context
> > > > + * @job: TLB invalidation job object
> > > > + *
> > > > + * Return: TLB invalidation job fence context
> > > > + */
> > > > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job)
> > > > +{
> > > > + return job->fence_context;
> > > > +}
> > > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > > b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > > index e63edcb26b50..2576165c2228 100644
> > > > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > > @@ -30,4 +30,6 @@ void xe_tlb_inval_job_get(struct
> > > > xe_tlb_inval_job
> > > > *job);
> > > >
> > > > void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job);
> > > >
> > > > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job
> > > > *job);
> > > > +
> > > > #endif
> > >
>
* Re: [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations
2025-10-17 16:52 ` [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations Matthew Brost
2025-10-21 17:55 ` Summers, Stuart
@ 2025-10-22 8:00 ` Tvrtko Ursulin
2025-10-22 15:10 ` Matthew Brost
2025-10-23 12:28 ` Thomas Hellström
2 siblings, 1 reply; 15+ messages in thread
From: Tvrtko Ursulin @ 2025-10-22 8:00 UTC (permalink / raw)
To: Matthew Brost, intel-xe; +Cc: carlos.santa, thomas.hellstrom, Philipp Stanner
On 17/10/2025 17:52, Matthew Brost wrote:
> When a burst of unbind jobs is issued, a dependency chain can form
> between the TLB invalidation of a previous unbind job and the current
> one. This leads to undesirable serialization, causing current jobs to
> wait unnecessarily for prior TLB invalidations, execute on the GPU when
> not needed, and significantly slow down the unbind burst—resulting in up
> to a 4× slowdown.
>
> To break this chain, mask the last bind queue dependency if the last
> fence's DMA context matches the TLB invalidation context. This allows
> full pipelining of unbinds and TLB invalidations while preserving
> correct dma-fence signaling semantics.
>
> Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
> drivers/gpu/drm/xe/xe_exec.c | 3 +-
> drivers/gpu/drm/xe/xe_exec_queue.c | 18 +++++++++--
> drivers/gpu/drm/xe/xe_exec_queue.h | 3 +-
> drivers/gpu/drm/xe/xe_pt.c | 15 +++++++--
> drivers/gpu/drm/xe/xe_sched_job.c | 44 ++++++++++++++++++++++++++-
> drivers/gpu/drm/xe/xe_sched_job.h | 7 ++++-
> drivers/gpu/drm/xe/xe_tlb_inval_job.c | 14 +++++++++
> drivers/gpu/drm/xe/xe_tlb_inval_job.h | 2 ++
> 8 files changed, 98 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> index 0dc27476832b..6034cfc8be06 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -294,7 +294,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> goto err_put_job;
>
> if (!xe_vm_in_lr_mode(vm)) {
> - err = xe_sched_job_last_fence_add_dep(job, vm);
> + err = xe_sched_job_last_fence_add_dep(job, vm, NO_MASK_DEP,
> + NO_MASK_DEP);
> if (err)
> goto err_put_job;
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 90cbc95f8e2e..d6f69d9bccba 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -25,6 +25,7 @@
> #include "xe_migrate.h"
> #include "xe_pm.h"
> #include "xe_ring_ops_types.h"
> +#include "xe_sched_job.h"
> #include "xe_trace.h"
> #include "xe_vm.h"
> #include "xe_pxp.h"
> @@ -1106,11 +1107,17 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm,
> * xe_exec_queue_last_fence_test_dep - Test last fence dependency of queue
> * @q: The exec queue
> * @vm: The VM the engine does a bind or exec for
> + * @mask_ctx0: Mask dma-fence context0
> + * @mask_ctx1: Mask dma-fence context1
> + *
> + * Test last fence dependency of queue, skipping masked dma fence contexts.
> *
> * Returns:
> - * -ETIME if there exists an unsignalled last fence dependency, zero otherwise.
> + * -ETIME if there exists an unsignalled and unmasked last fence dependency,
> + * zero otherwise.
> */
> -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> +int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm,
> + u64 mask_ctx0, u64 mask_ctx1)
> {
> struct dma_fence *fence;
> int err = 0;
> @@ -1119,6 +1126,13 @@ int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> if (fence) {
> err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) ?
> 0 : -ETIME;
> +
> + if (err == -ETIME) {
> + if (xe_sched_job_mask_dependency(fence, mask_ctx0,
> + mask_ctx1))
> + err = 0;
> + }
> +
> dma_fence_put(fence);
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> index a4dfbe858bda..99a35b22a46c 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> @@ -85,7 +85,8 @@ struct dma_fence *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
> void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
> struct dma_fence *fence);
> int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> - struct xe_vm *vm);
> + struct xe_vm *vm, u64 mask_ctx0,
> + u64 mask_ctx1);
> void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
>
> int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch);
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index d22fd1ccc0ba..bba9ae559f57 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -1341,10 +1341,21 @@ static int xe_pt_vm_dependencies(struct xe_sched_job *job,
> }
>
> if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
> + u64 mask_ctx0 = NO_MASK_DEP, mask_ctx1 = NO_MASK_DEP;
> +
> + if (ijob)
> + mask_ctx0 = xe_tlb_inval_job_fence_context(ijob);
> + if (mjob)
> + mask_ctx1 = xe_tlb_inval_job_fence_context(mjob);
> +
> if (job)
> - err = xe_sched_job_last_fence_add_dep(job, vm);
> + err = xe_sched_job_last_fence_add_dep(job, vm,
> + mask_ctx0,
> + mask_ctx1);
> else
> - err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> + err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q,
> + vm, mask_ctx0,
> + mask_ctx1);
> }
>
> for (i = 0; job && !err && i < vops->num_syncs; i++)
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
> index d21bf8f26964..7cbdd87904c6 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.c
> +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> @@ -6,6 +6,7 @@
> #include "xe_sched_job.h"
>
> #include <uapi/drm/xe_drm.h>
> +#include <linux/dma-fence-array.h>
> #include <linux/dma-fence-chain.h>
> #include <linux/slab.h>
>
> @@ -295,19 +296,60 @@ void xe_sched_job_push(struct xe_sched_job *job)
> xe_sched_job_put(job);
> }
>
> +/**
> + * xe_sched_job_mask_dependency() - Determine if a dma-fence dependency can be masked
> + * @fence: The dma-fence to check
> + * @mask_ctx0: First context to compare against the fence's context
> + * @mask_ctx1: Second context to compare against the fence's context
> + *
> + * This function checks whether the context of the given dma-fence matches
> + * either of the provided mask contexts. If a match is found, the dependency
> + * represented by the fence can be skipped. If the fence is a dma-fence-array,
> + * its individual fences are unwound and checked.
> + *
> + * Return: true if the fence can be masked (i.e., skipped), false otherwise.
> + */
> +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> + u64 mask_ctx1)
> +{
> + if (dma_fence_is_array(fence)) {
> + struct dma_fence *__fence;
> + int index;
> +
> + dma_fence_array_for_each(__fence, index, fence)
> + if (__fence->context == mask_ctx0 ||
> + __fence->context == mask_ctx1)
> + return true;
> + } else if (fence->context == mask_ctx0 ||
> + fence->context == mask_ctx1) {
> + return true;
> + }
> +
> + return false;
> +}
> +
> /**
> * xe_sched_job_last_fence_add_dep - Add last fence dependency to job
> * @job:job to add the last fence dependency to
> * @vm: virtual memory job belongs to
> + * @mask_ctx0: Mask dma-fence context0
> + * @mask_ctx1: Mask dma-fence context1
> + *
> + * Add last fence dependency to job, skipping masked dma fence contexts.
> *
> * Returns:
> * 0 on success, or an error on failing to expand the array.
> */
> -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm)
> +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> + u64 mask_ctx0, u64 mask_ctx1)
> {
> struct dma_fence *fence;
>
> fence = xe_exec_queue_last_fence_get(job->q, vm);
> + if (xe_sched_job_mask_dependency(fence, mask_ctx0, mask_ctx1)) {
> + dma_fence_put(fence);
> + return 0;
> + }
>
> return drm_sched_job_add_dependency(&job->drm, fence);
> }
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h
> index 3dc72c5c1f13..81d8e848e605 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.h
> +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> @@ -58,7 +58,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job);
> void xe_sched_job_arm(struct xe_sched_job *job);
> void xe_sched_job_push(struct xe_sched_job *job);
>
> -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm);
> +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> + u64 mask_ctx0, u64 mask_ctx1);
> void xe_sched_job_init_user_fence(struct xe_sched_job *job,
> struct xe_sync_entry *sync);
>
> @@ -93,4 +94,8 @@ void xe_sched_job_snapshot_print(struct xe_sched_job_snapshot *snapshot, struct
> int xe_sched_job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
> enum dma_resv_usage usage);
>
> +#define NO_MASK_DEP (~0x0ull)
> +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> + u64 mask_ctx1);
> +
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> index 492def04a559..f2fe7f9fbb22 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> @@ -32,6 +32,8 @@ struct xe_tlb_inval_job {
> u64 start;
> /** @end: End address to invalidate */
> u64 end;
> + /** @fence_context: Fence context for job */
> + u64 fence_context;
> /** @asid: Address space ID to invalidate */
> u32 asid;
> /** @fence_armed: Fence has been armed */
> @@ -101,6 +103,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
> job->asid = asid;
> job->fence_armed = false;
> job->dep.ops = &dep_job_ops;
> + job->fence_context = entity->fence_context + 1;
As a side note, hardcoding the assumption about how the scheduler
allocates contexts is not great, given recent efforts to make drivers
know less of the scheduler internals.
But what I really wanted to ask is, having only glanced at the patch
briefly: could the xe performance problem here also be solved by
unwrapping the container fences at the DRM scheduler dependency
tracking level?
I am asking because amdgpu recently posted a patch to unwrap in their
code for potentially similar performance reasons, and if now xe wants
something similar, or even the same, it is an interesting question where
to do it.
Also, I have a patch (not sure if I posted it so far) which unwraps in
drm_sched_job_add_dependency() and converts the dependency xarray to an
unwrapped dma-fence-array. The initial idea there was to allow the
scheduler worker to be woken up only once, once all deps are signaled,
but now that two drivers seem to be unwrapping fences, maybe there is a
case to be made for doing it in the core.
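The core of it is a dma_fence_unwrap walk, roughly like this (sketch
only, function name made up; note drm_sched_job_add_dependency()
consumes a fence reference, hence the get/put dance):

	#include <linux/dma-fence-unwrap.h>

	static int sched_job_add_dependency_unwrapped(struct drm_sched_job *job,
						      struct dma_fence *fence)
	{
		struct dma_fence_unwrap iter;
		struct dma_fence *f;
		int ret = 0;

		/* Add each component of a chain/array as its own dependency
		 * instead of the container as a whole. */
		dma_fence_unwrap_for_each(f, &iter, fence) {
			ret = drm_sched_job_add_dependency(job, dma_fence_get(f));
			if (ret)
				break;
		}

		dma_fence_put(fence); /* drop the container reference */
		return ret;
	}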
Regards,
Tvrtko
> kref_init(&job->refcount);
> xe_exec_queue_get(q); /* Pairs with put in xe_tlb_inval_job_destroy */
>
> @@ -266,3 +269,14 @@ void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job)
> if (!IS_ERR_OR_NULL(job))
> kref_put(&job->refcount, xe_tlb_inval_job_destroy);
> }
> +
> +/**
> + * xe_tlb_inval_job_fence_context() - TLB invalidation job fence context
> + * @job: TLB invalidation job object
> + *
> + * Return: TLB invalidation job fence context
> + */
> +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job)
> +{
> + return job->fence_context;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> index e63edcb26b50..2576165c2228 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> @@ -30,4 +30,6 @@ void xe_tlb_inval_job_get(struct xe_tlb_inval_job *job);
>
> void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job);
>
> +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job);
> +
> #endif
* Re: [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations
2025-10-22 8:00 ` Tvrtko Ursulin
@ 2025-10-22 15:10 ` Matthew Brost
2025-10-23 12:46 ` Tvrtko Ursulin
0 siblings, 1 reply; 15+ messages in thread
From: Matthew Brost @ 2025-10-22 15:10 UTC (permalink / raw)
To: Tvrtko Ursulin; +Cc: intel-xe, carlos.santa, thomas.hellstrom, Philipp Stanner
On Wed, Oct 22, 2025 at 09:00:47AM +0100, Tvrtko Ursulin wrote:
>
> On 17/10/2025 17:52, Matthew Brost wrote:
> > When a burst of unbind jobs is issued, a dependency chain can form
> > between the TLB invalidation of a previous unbind job and the current
> > one. This leads to undesirable serialization, causing current jobs to
> > wait unnecessarily for prior TLB invalidations, execute on the GPU when
> > not needed, and significantly slow down the unbind burst—resulting in up
> > to a 4× slowdown.
> >
> > To break this chain, mask the last bind queue dependency if the last
> > fence's DMA context matches the TLB invalidation context. This allows
> > full pipelining of unbinds and TLB invalidations while preserving
> > correct dma-fence signaling semantics.
> >
> > Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_exec.c | 3 +-
> > drivers/gpu/drm/xe/xe_exec_queue.c | 18 +++++++++--
> > drivers/gpu/drm/xe/xe_exec_queue.h | 3 +-
> > drivers/gpu/drm/xe/xe_pt.c | 15 +++++++--
> > drivers/gpu/drm/xe/xe_sched_job.c | 44 ++++++++++++++++++++++++++-
> > drivers/gpu/drm/xe/xe_sched_job.h | 7 ++++-
> > drivers/gpu/drm/xe/xe_tlb_inval_job.c | 14 +++++++++
> > drivers/gpu/drm/xe/xe_tlb_inval_job.h | 2 ++
> > 8 files changed, 98 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> > index 0dc27476832b..6034cfc8be06 100644
> > --- a/drivers/gpu/drm/xe/xe_exec.c
> > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > @@ -294,7 +294,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > goto err_put_job;
> > if (!xe_vm_in_lr_mode(vm)) {
> > - err = xe_sched_job_last_fence_add_dep(job, vm);
> > + err = xe_sched_job_last_fence_add_dep(job, vm, NO_MASK_DEP,
> > + NO_MASK_DEP);
> > if (err)
> > goto err_put_job;
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> > index 90cbc95f8e2e..d6f69d9bccba 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > @@ -25,6 +25,7 @@
> > #include "xe_migrate.h"
> > #include "xe_pm.h"
> > #include "xe_ring_ops_types.h"
> > +#include "xe_sched_job.h"
> > #include "xe_trace.h"
> > #include "xe_vm.h"
> > #include "xe_pxp.h"
> > @@ -1106,11 +1107,17 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm,
> > * xe_exec_queue_last_fence_test_dep - Test last fence dependency of queue
> > * @q: The exec queue
> > * @vm: The VM the engine does a bind or exec for
> > + * @mask_ctx0: Mask dma-fence context0
> > + * @mask_ctx1: Mask dma-fence context1
> > + *
> > + * Test last fence dependency of queue, skipping masked dma fence contexts.
> > *
> > * Returns:
> > - * -ETIME if there exists an unsignalled last fence dependency, zero otherwise.
> > + * -ETIME if there exists an unsignalled and unmasked last fence dependency,
> > + * zero otherwise.
> > */
> > -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> > +int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm,
> > + u64 mask_ctx0, u64 mask_ctx1)
> > {
> > struct dma_fence *fence;
> > int err = 0;
> > @@ -1119,6 +1126,13 @@ int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> > if (fence) {
> > err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) ?
> > 0 : -ETIME;
> > +
> > + if (err == -ETIME) {
> > + if (xe_sched_job_mask_dependency(fence, mask_ctx0,
> > + mask_ctx1))
> > + err = 0;
> > + }
> > +
> > dma_fence_put(fence);
> > }
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> > index a4dfbe858bda..99a35b22a46c 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> > @@ -85,7 +85,8 @@ struct dma_fence *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
> > void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
> > struct dma_fence *fence);
> > int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > - struct xe_vm *vm);
> > + struct xe_vm *vm, u64 mask_ctx0,
> > + u64 mask_ctx1);
> > void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
> > int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch);
> > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > index d22fd1ccc0ba..bba9ae559f57 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.c
> > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > @@ -1341,10 +1341,21 @@ static int xe_pt_vm_dependencies(struct xe_sched_job *job,
> > }
> > if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
> > + u64 mask_ctx0 = NO_MASK_DEP, mask_ctx1 = NO_MASK_DEP;
> > +
> > + if (ijob)
> > + mask_ctx0 = xe_tlb_inval_job_fence_context(ijob);
> > + if (mjob)
> > + mask_ctx1 = xe_tlb_inval_job_fence_context(mjob);
> > +
> > if (job)
> > - err = xe_sched_job_last_fence_add_dep(job, vm);
> > + err = xe_sched_job_last_fence_add_dep(job, vm,
> > + mask_ctx0,
> > + mask_ctx1);
> > else
> > - err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> > + err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q,
> > + vm, mask_ctx0,
> > + mask_ctx1);
> > }
> > for (i = 0; job && !err && i < vops->num_syncs; i++)
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
> > index d21bf8f26964..7cbdd87904c6 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job.c
> > +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> > @@ -6,6 +6,7 @@
> > #include "xe_sched_job.h"
> > #include <uapi/drm/xe_drm.h>
> > +#include <linux/dma-fence-array.h>
> > #include <linux/dma-fence-chain.h>
> > #include <linux/slab.h>
> > @@ -295,19 +296,60 @@ void xe_sched_job_push(struct xe_sched_job *job)
> > xe_sched_job_put(job);
> > }
> > +/**
> > + * xe_sched_job_mask_dependency() - Determine if a dma-fence dependency can be masked
> > + * @fence: The dma-fence to check
> > + * @mask_ctx0: First context to compare against the fence's context
> > + * @mask_ctx1: Second context to compare against the fence's context
> > + *
> > + * This function checks whether the context of the given dma-fence matches
> > + * either of the provided mask contexts. If a match is found, the dependency
> > + * represented by the fence can be skipped. If the fence is a dma-fence-array,
> > + * its individual fences are unwound and checked.
> > + *
> > + * Return: true if the fence can be masked (i.e., skipped), false otherwise.
> > + */
> > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> > + u64 mask_ctx1)
> > +{
> > + if (dma_fence_is_array(fence)) {
> > + struct dma_fence *__fence;
> > + int index;
> > +
> > + dma_fence_array_for_each(__fence, index, fence)
> > + if (__fence->context == mask_ctx0 ||
> > + __fence->context == mask_ctx1)
> > + return true;
> > + } else if (fence->context == mask_ctx0 ||
> > + fence->context == mask_ctx1) {
> > + return true;
> > + }
> > +
> > + return false;
> > +}
> > +
> > /**
> > * xe_sched_job_last_fence_add_dep - Add last fence dependency to job
> > * @job:job to add the last fence dependency to
> > * @vm: virtual memory job belongs to
> > + * @mask_ctx0: Mask dma-fence context0
> > + * @mask_ctx1: Mask dma-fence context1
> > + *
> > + * Add last fence dependency to job, skipping masked dma fence contexts.
> > *
> > * Returns:
> > * 0 on success, or an error on failing to expand the array.
> > */
> > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm)
> > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> > + u64 mask_ctx0, u64 mask_ctx1)
> > {
> > struct dma_fence *fence;
> > fence = xe_exec_queue_last_fence_get(job->q, vm);
> > + if (xe_sched_job_mask_dependency(fence, mask_ctx0, mask_ctx1)) {
> > + dma_fence_put(fence);
> > + return 0;
> > + }
> > return drm_sched_job_add_dependency(&job->drm, fence);
> > }
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h
> > index 3dc72c5c1f13..81d8e848e605 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job.h
> > +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> > @@ -58,7 +58,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job);
> > void xe_sched_job_arm(struct xe_sched_job *job);
> > void xe_sched_job_push(struct xe_sched_job *job);
> > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm);
> > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> > + u64 mask_ctx0, u64 mask_ctx1);
> > void xe_sched_job_init_user_fence(struct xe_sched_job *job,
> > struct xe_sync_entry *sync);
> > @@ -93,4 +94,8 @@ void xe_sched_job_snapshot_print(struct xe_sched_job_snapshot *snapshot, struct
> > int xe_sched_job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
> > enum dma_resv_usage usage);
> > +#define NO_MASK_DEP (~0x0ull)
> > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> > + u64 mask_ctx1);
> > +
> > #endif
> > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > index 492def04a559..f2fe7f9fbb22 100644
> > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > @@ -32,6 +32,8 @@ struct xe_tlb_inval_job {
> > u64 start;
> > /** @end: End address to invalidate */
> > u64 end;
> > + /** @fence_context: Fence context for job */
> > + u64 fence_context;
> > /** @asid: Address space ID to invalidate */
> > u32 asid;
> > /** @fence_armed: Fence has been armed */
> > @@ -101,6 +103,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
> > job->asid = asid;
> > job->fence_armed = false;
> > job->dep.ops = &dep_job_ops;
> > + job->fence_context = entity->fence_context + 1;
>
> As a side note, hardcoding the assumption about how the scheduler
> allocates contexts is not great, given recent efforts to make drivers
> know less of the scheduler internals.
>
Yes, we should probably have a helper here — maybe
drm_sched_job_finished_context?
I was planning to roll this change into [1], but that series hasn’t
gained much traction, and fixing this is a fairly high-priority issue
for customers.
This is documented in the DRM scheduler kernel docs:
entity->fence_context + 1 is the job's finished context.
[1] https://patchwork.freedesktop.org/series/155314/
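E.g. something like this - name and placement hypothetical, pending
that series:

	static inline u64 drm_sched_job_finished_context(struct drm_sched_job *job)
	{
		/* Per the documented convention: scheduled context is
		 * entity->fence_context, finished context is +1. */
		return job->entity->fence_context + 1;
	}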
> But what I really wanted to ask is, having only glanced at the patch
> briefly: could the xe performance problem here also be solved by
> unwrapping the container fences at the DRM scheduler dependency
> tracking level?
>
This is primarily about preventing TLB fences — which originate from a
different context than the bind queue but are still ordered on the queue
— from becoming dependencies. The process involves two passes: in the
first pass, we detect dependencies. If none are found, we immediately
complete the bind via the CPU. If dependencies are present, we defer the
bind to the GPU.
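Roughly, in condensed pseudo-C (the real logic lives in
xe_pt_vm_dependencies(); names as in the patch):

	/* First pass: probe for unsignaled deps, skipping the contexts of
	 * the TLB invalidation jobs this operation will issue itself. */
	err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm,
						mask_ctx0, mask_ctx1);
	if (!err) {
		/* No dependencies: complete the bind immediately via the CPU. */
	} else {
		/* Dependencies found: create a job and defer the bind to the GPU. */
	}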
> I am asking because amdgpu recently posted a patch to unwrap in their code
> for potentially similar performance reasons, and if now xe wants something
> similar, or even the same, it is an interesting question where to do it.
>
> Also, I have a patch (not sure if I posted it so far) which unwraps in
> drm_sched_job_add_dependency() and converts the dependency xarray to an
> unwrapped dma-fence-array. The initial idea there was to allow the
> scheduler worker to be woken up only once, once all deps are signaled,
> but now that two drivers seem to be unwrapping fences, maybe there is a
> case to be made for doing it in the core.
>
I don't think this is the same problem as the one above, but it's an
interesting idea in general. CC me if you post this one.
Matt
> Regards,
>
> Tvrtko
>
> > kref_init(&job->refcount);
> > xe_exec_queue_get(q); /* Pairs with put in xe_tlb_inval_job_destroy */
> > @@ -266,3 +269,14 @@ void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job)
> > if (!IS_ERR_OR_NULL(job))
> > kref_put(&job->refcount, xe_tlb_inval_job_destroy);
> > }
> > +
> > +/**
> > + * xe_tlb_inval_job_fence_context() - TLB invalidation job fence context
> > + * @job: TLB invalidation job object
> > + *
> > + * Return: TLB invalidation job fence context
> > + */
> > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job)
> > +{
> > + return job->fence_context;
> > +}
> > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > index e63edcb26b50..2576165c2228 100644
> > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > @@ -30,4 +30,6 @@ void xe_tlb_inval_job_get(struct xe_tlb_inval_job *job);
> > void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job);
> > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job);
> > +
> > #endif
>
* Re: [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations
2025-10-22 15:10 ` Matthew Brost
@ 2025-10-23 12:46 ` Tvrtko Ursulin
2025-10-23 18:55 ` Matthew Brost
0 siblings, 1 reply; 15+ messages in thread
From: Tvrtko Ursulin @ 2025-10-23 12:46 UTC (permalink / raw)
To: Matthew Brost; +Cc: intel-xe, carlos.santa, thomas.hellstrom, Philipp Stanner
On 22/10/2025 16:10, Matthew Brost wrote:
> On Wed, Oct 22, 2025 at 09:00:47AM +0100, Tvrtko Ursulin wrote:
>>
>> On 17/10/2025 17:52, Matthew Brost wrote:
>>> When a burst of unbind jobs is issued, a dependency chain can form
>>> between the TLB invalidation of a previous unbind job and the current
>>> one. This leads to undesirable serialization, causing current jobs to
>>> wait unnecessarily for prior TLB invalidations, execute on the GPU when
>>> not needed, and significantly slow down the unbind burst—resulting in up
>>> to a 4× slowdown.
>>>
>>> To break this chain, mask the last bind queue dependency if the last
>>> fence's DMA context matches the TLB invalidation context. This allows
>>> full pipelining of unbinds and TLB invalidations while preserving
>>> correct dma-fence signaling semantics.
>>>
>>> Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>> drivers/gpu/drm/xe/xe_exec.c | 3 +-
>>> drivers/gpu/drm/xe/xe_exec_queue.c | 18 +++++++++--
>>> drivers/gpu/drm/xe/xe_exec_queue.h | 3 +-
>>> drivers/gpu/drm/xe/xe_pt.c | 15 +++++++--
>>> drivers/gpu/drm/xe/xe_sched_job.c | 44 ++++++++++++++++++++++++++-
>>> drivers/gpu/drm/xe/xe_sched_job.h | 7 ++++-
>>> drivers/gpu/drm/xe/xe_tlb_inval_job.c | 14 +++++++++
>>> drivers/gpu/drm/xe/xe_tlb_inval_job.h | 2 ++
>>> 8 files changed, 98 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
>>> index 0dc27476832b..6034cfc8be06 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec.c
>>> +++ b/drivers/gpu/drm/xe/xe_exec.c
>>> @@ -294,7 +294,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>> goto err_put_job;
>>> if (!xe_vm_in_lr_mode(vm)) {
>>> - err = xe_sched_job_last_fence_add_dep(job, vm);
>>> + err = xe_sched_job_last_fence_add_dep(job, vm, NO_MASK_DEP,
>>> + NO_MASK_DEP);
>>> if (err)
>>> goto err_put_job;
>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
>>> index 90cbc95f8e2e..d6f69d9bccba 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
>>> @@ -25,6 +25,7 @@
>>> #include "xe_migrate.h"
>>> #include "xe_pm.h"
>>> #include "xe_ring_ops_types.h"
>>> +#include "xe_sched_job.h"
>>> #include "xe_trace.h"
>>> #include "xe_vm.h"
>>> #include "xe_pxp.h"
>>> @@ -1106,11 +1107,17 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm,
>>> * xe_exec_queue_last_fence_test_dep - Test last fence dependency of queue
>>> * @q: The exec queue
>>> * @vm: The VM the engine does a bind or exec for
>>> + * @mask_ctx0: Mask dma-fence context0
>>> + * @mask_ctx1: Mask dma-fence context1
>>> + *
>>> + * Test last fence dependency of queue, skipping masked dma fence contexts.
>>> *
>>> * Returns:
>>> - * -ETIME if there exists an unsignalled last fence dependency, zero otherwise.
>>> + * -ETIME if there exists an unsignalled and unmasked last fence dependency,
>>> + * zero otherwise.
>>> */
>>> -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
>>> +int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm,
>>> + u64 mask_ctx0, u64 mask_ctx1)
>>> {
>>> struct dma_fence *fence;
>>> int err = 0;
>>> @@ -1119,6 +1126,13 @@ int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
>>> if (fence) {
>>> err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) ?
>>> 0 : -ETIME;
>>> +
>>> + if (err == -ETIME) {
>>> + if (xe_sched_job_mask_dependency(fence, mask_ctx0,
>>> + mask_ctx1))
>>> + err = 0;
>>> + }
>>> +
>>> dma_fence_put(fence);
>>> }
>>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
>>> index a4dfbe858bda..99a35b22a46c 100644
>>> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
>>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
>>> @@ -85,7 +85,8 @@ struct dma_fence *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
>>> void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
>>> struct dma_fence *fence);
>>> int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
>>> - struct xe_vm *vm);
>>> + struct xe_vm *vm, u64 mask_ctx0,
>>> + u64 mask_ctx1);
>>> void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
>>> int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch);
>>> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
>>> index d22fd1ccc0ba..bba9ae559f57 100644
>>> --- a/drivers/gpu/drm/xe/xe_pt.c
>>> +++ b/drivers/gpu/drm/xe/xe_pt.c
>>> @@ -1341,10 +1341,21 @@ static int xe_pt_vm_dependencies(struct xe_sched_job *job,
>>> }
>>> if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
>>> + u64 mask_ctx0 = NO_MASK_DEP, mask_ctx1 = NO_MASK_DEP;
>>> +
>>> + if (ijob)
>>> + mask_ctx0 = xe_tlb_inval_job_fence_context(ijob);
>>> + if (mjob)
>>> + mask_ctx1 = xe_tlb_inval_job_fence_context(mjob);
>>> +
>>> if (job)
>>> - err = xe_sched_job_last_fence_add_dep(job, vm);
>>> + err = xe_sched_job_last_fence_add_dep(job, vm,
>>> + mask_ctx0,
>>> + mask_ctx1);
>>> else
>>> - err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
>>> + err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q,
>>> + vm, mask_ctx0,
>>> + mask_ctx1);
>>> }
>>> for (i = 0; job && !err && i < vops->num_syncs; i++)
>>> diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
>>> index d21bf8f26964..7cbdd87904c6 100644
>>> --- a/drivers/gpu/drm/xe/xe_sched_job.c
>>> +++ b/drivers/gpu/drm/xe/xe_sched_job.c
>>> @@ -6,6 +6,7 @@
>>> #include "xe_sched_job.h"
>>> #include <uapi/drm/xe_drm.h>
>>> +#include <linux/dma-fence-array.h>
>>> #include <linux/dma-fence-chain.h>
>>> #include <linux/slab.h>
>>> @@ -295,19 +296,60 @@ void xe_sched_job_push(struct xe_sched_job *job)
>>> xe_sched_job_put(job);
>>> }
>>> +/**
>>> + * xe_sched_job_mask_dependency() - Determine if a dma-fence dependency can be masked
>>> + * @fence: The dma-fence to check
>>> + * @mask_ctx0: First context to compare against the fence's context
>>> + * @mask_ctx1: Second context to compare against the fence's context
>>> + *
>>> + * This function checks whether the context of the given dma-fence matches
>>> + * either of the provided mask contexts. If a match is found, the dependency
>>> + * represented by the fence can be skipped. If the fence is a dma-fence-array,
>>> + * its individual fences are unwound and checked.
>>> + *
>>> + * Return: true if the fence can be masked (i.e., skipped), false otherwise.
>>> + */
>>> +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
>>> + u64 mask_ctx1)
>>> +{
>>> + if (dma_fence_is_array(fence)) {
>>> + struct dma_fence *__fence;
>>> + int index;
>>> +
>>> + dma_fence_array_for_each(__fence, index, fence)
>>> + if (__fence->context == mask_ctx0 ||
>>> + __fence->context == mask_ctx1)
>>> + return true;
>>> + } else if (fence->context == mask_ctx0 ||
>>> + fence->context == mask_ctx1) {
>>> + return true;
>>> + }
>>> +
>>> + return false;
>>> +}
>>> +
>>> /**
>>> * xe_sched_job_last_fence_add_dep - Add last fence dependency to job
>>> * @job:job to add the last fence dependency to
>>> * @vm: virtual memory job belongs to
>>> + * @mask_ctx0: Mask dma-fence context0
>>> + * @mask_ctx1: Mask dma-fence context1
>>> + *
>>> + * Add last fence dependency to job, skipping masked dma fence contexts.
>>> *
>>> * Returns:
>>> * 0 on success, or an error on failing to expand the array.
>>> */
>>> -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm)
>>> +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
>>> + u64 mask_ctx0, u64 mask_ctx1)
>>> {
>>> struct dma_fence *fence;
>>> fence = xe_exec_queue_last_fence_get(job->q, vm);
>>> + if (xe_sched_job_mask_dependency(fence, mask_ctx0, mask_ctx1)) {
>>> + dma_fence_put(fence);
>>> + return 0;
>>> + }
>>> return drm_sched_job_add_dependency(&job->drm, fence);
>>> }
>>> diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h
>>> index 3dc72c5c1f13..81d8e848e605 100644
>>> --- a/drivers/gpu/drm/xe/xe_sched_job.h
>>> +++ b/drivers/gpu/drm/xe/xe_sched_job.h
>>> @@ -58,7 +58,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job);
>>> void xe_sched_job_arm(struct xe_sched_job *job);
>>> void xe_sched_job_push(struct xe_sched_job *job);
>>> -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm);
>>> +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
>>> + u64 mask_ctx0, u64 mask_ctx1);
>>> void xe_sched_job_init_user_fence(struct xe_sched_job *job,
>>> struct xe_sync_entry *sync);
>>> @@ -93,4 +94,8 @@ void xe_sched_job_snapshot_print(struct xe_sched_job_snapshot *snapshot, struct
>>> int xe_sched_job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
>>> enum dma_resv_usage usage);
>>> +#define NO_MASK_DEP (~0x0ull)
>>> +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
>>> + u64 mask_ctx1);
>>> +
>>> #endif
>>> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
>>> index 492def04a559..f2fe7f9fbb22 100644
>>> --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
>>> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
>>> @@ -32,6 +32,8 @@ struct xe_tlb_inval_job {
>>> u64 start;
>>> /** @end: End address to invalidate */
>>> u64 end;
>>> + /** @fence_context: Fence context for job */
>>> + u64 fence_context;
>>> /** @asid: Address space ID to invalidate */
>>> u32 asid;
>>> /** @fence_armed: Fence has been armed */
>>> @@ -101,6 +103,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
>>> job->asid = asid;
>>> job->fence_armed = false;
>>> job->dep.ops = &dep_job_ops;
>>> + job->fence_context = entity->fence_context + 1;
>>
> > As a side note, hardcoding the assumption about how the scheduler
> > allocates contexts is not great, given recent efforts to make drivers
> > know less of the scheduler internals.
>>
>
> Yes, we should probably have a helper here — maybe
> drm_sched_job_finished_context?
>
> I was planning to roll this change into [1], but that series hasn’t
> gained much traction, and fixing this is a fairly high-priority issue
> for customers.
>
> This is documented in the DRM scheduler kernel docs:
> entity->fence_context + 1 is the job's finished context.
>
> [1] https://patchwork.freedesktop.org/series/155314/
>
> > But what I really wanted to ask is, having only glanced at the patch
> > briefly: could the xe performance problem here also be solved by
> > unwrapping the container fences at the DRM scheduler dependency
> > tracking level?
>>
>
> This is primarily about preventing TLB fences — which originate from a
> different context than the bind queue but are still ordered on the queue
> — from becoming dependencies. The process involves two passes: in the
> first pass, we detect dependencies. If none are found, we immediately
> complete the bind via the CPU. If dependencies are present, we defer the
> bind to the GPU.
Interesting, I saw fence unwrapping and context number checking and
thought it was maybe the same problem. I do not understand what xe is
doing well enough to comment strongly, but it does raise the question
of whether there could be a more elegant solution (i.e. not a hack).
Could the two entities be shared and would that solve the problem? I
mean the TLB invalidation and the bind queue entities, do they need to
be separate if the assumption and guarantee is to execute in order?
>> I am asking because amdgpu recently posted a patch to unwrap in their code
>> for potentially similar performance reasons, and if now xe wants something
>> similar, or even the same, it is an interesting question where to do it.
>>
> > Also, I have a patch (not sure if I posted it so far) which unwraps in
> > drm_sched_job_add_dependency() and converts the dependency xarray to an
> > unwrapped dma-fence-array. The initial idea there was to allow the
> > scheduler worker to be woken up only once, once all deps are signaled,
> > but now that two drivers seem to be unwrapping fences, maybe there is a
> > case to be made for doing it in the core.
>>
>
> I don't think this is the same problem as the one above, but it's an
> interesting idea in general. CC me if you post this one.
Okay, but since it sounds like it would not help here, cleaning it up
and sending it will not be a priority, so it might be a while.
Regards,
Tvrtko
>
> Matt
>
>> Regards,
>>
>> Tvrtko
>>
>>> kref_init(&job->refcount);
>>> xe_exec_queue_get(q); /* Pairs with put in xe_tlb_inval_job_destroy */
>>> @@ -266,3 +269,14 @@ void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job)
>>> if (!IS_ERR_OR_NULL(job))
>>> kref_put(&job->refcount, xe_tlb_inval_job_destroy);
>>> }
>>> +
>>> +/**
>>> + * xe_tlb_inval_job_fence_context() - TLB invalidation job fence context
>>> + * @job: TLB invalidation job object
>>> + *
>>> + * Return: TLB invalidation job fence context
>>> + */
>>> +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job)
>>> +{
>>> + return job->fence_context;
>>> +}
>>> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
>>> index e63edcb26b50..2576165c2228 100644
>>> --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
>>> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
>>> @@ -30,4 +30,6 @@ void xe_tlb_inval_job_get(struct xe_tlb_inval_job *job);
>>> void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job);
>>> +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job);
>>> +
>>> #endif
>>
* Re: [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations
2025-10-23 12:46 ` Tvrtko Ursulin
@ 2025-10-23 18:55 ` Matthew Brost
2025-10-23 19:27 ` Matthew Brost
0 siblings, 1 reply; 15+ messages in thread
From: Matthew Brost @ 2025-10-23 18:55 UTC (permalink / raw)
To: Tvrtko Ursulin; +Cc: intel-xe, carlos.santa, thomas.hellstrom, Philipp Stanner
On Thu, Oct 23, 2025 at 01:46:26PM +0100, Tvrtko Ursulin wrote:
>
> On 22/10/2025 16:10, Matthew Brost wrote:
> > On Wed, Oct 22, 2025 at 09:00:47AM +0100, Tvrtko Ursulin wrote:
> > >
> > > On 17/10/2025 17:52, Matthew Brost wrote:
> > > > When a burst of unbind jobs is issued, a dependency chain can form
> > > > between the TLB invalidation of a previous unbind job and the current
> > > > one. This leads to undesirable serialization, causing current jobs to
> > > > wait unnecessarily for prior TLB invalidations, execute on the GPU when
> > > > not needed, and significantly slow down the unbind burst—resulting in up
> > > > to a 4× slowdown.
> > > >
> > > > To break this chain, mask the last bind queue dependency if the last
> > > > fence's DMA context matches the TLB invalidation context. This allows
> > > > full pipelining of unbinds and TLB invalidations while preserving
> > > > correct dma-fence signaling semantics.
> > > >
> > > > Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > > drivers/gpu/drm/xe/xe_exec.c | 3 +-
> > > > drivers/gpu/drm/xe/xe_exec_queue.c | 18 +++++++++--
> > > > drivers/gpu/drm/xe/xe_exec_queue.h | 3 +-
> > > > drivers/gpu/drm/xe/xe_pt.c | 15 +++++++--
> > > > drivers/gpu/drm/xe/xe_sched_job.c | 44 ++++++++++++++++++++++++++-
> > > > drivers/gpu/drm/xe/xe_sched_job.h | 7 ++++-
> > > > drivers/gpu/drm/xe/xe_tlb_inval_job.c | 14 +++++++++
> > > > drivers/gpu/drm/xe/xe_tlb_inval_job.h | 2 ++
> > > > 8 files changed, 98 insertions(+), 8 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> > > > index 0dc27476832b..6034cfc8be06 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec.c
> > > > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > > > @@ -294,7 +294,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > > goto err_put_job;
> > > > if (!xe_vm_in_lr_mode(vm)) {
> > > > - err = xe_sched_job_last_fence_add_dep(job, vm);
> > > > + err = xe_sched_job_last_fence_add_dep(job, vm, NO_MASK_DEP,
> > > > + NO_MASK_DEP);
> > > > if (err)
> > > > goto err_put_job;
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > index 90cbc95f8e2e..d6f69d9bccba 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > @@ -25,6 +25,7 @@
> > > > #include "xe_migrate.h"
> > > > #include "xe_pm.h"
> > > > #include "xe_ring_ops_types.h"
> > > > +#include "xe_sched_job.h"
> > > > #include "xe_trace.h"
> > > > #include "xe_vm.h"
> > > > #include "xe_pxp.h"
> > > > @@ -1106,11 +1107,17 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm,
> > > > * xe_exec_queue_last_fence_test_dep - Test last fence dependency of queue
> > > > * @q: The exec queue
> > > > * @vm: The VM the engine does a bind or exec for
> > > > + * @mask_ctx0: Mask dma-fence context0
> > > > + * @mask_ctx1: Mask dma-fence context1
> > > > + *
> > > > + * Test last fence dependency of queue, skipping masked dma fence contexts.
> > > > *
> > > > * Returns:
> > > > - * -ETIME if there exists an unsignalled last fence dependency, zero otherwise.
> > > > + * -ETIME if there exists an unsignalled and unmasked last fence dependency,
> > > > + * zero otherwise.
> > > > */
> > > > -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> > > > +int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm,
> > > > + u64 mask_ctx0, u64 mask_ctx1)
> > > > {
> > > > struct dma_fence *fence;
> > > > int err = 0;
> > > > @@ -1119,6 +1126,13 @@ int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> > > > if (fence) {
> > > > err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) ?
> > > > 0 : -ETIME;
> > > > +
> > > > + if (err == -ETIME) {
> > > > + if (xe_sched_job_mask_dependency(fence, mask_ctx0,
> > > > + mask_ctx1))
> > > > + err = 0;
> > > > + }
> > > > +
> > > > dma_fence_put(fence);
> > > > }
> > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> > > > index a4dfbe858bda..99a35b22a46c 100644
> > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> > > > @@ -85,7 +85,8 @@ struct dma_fence *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
> > > > void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
> > > > struct dma_fence *fence);
> > > > int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > > > - struct xe_vm *vm);
> > > > + struct xe_vm *vm, u64 mask_ctx0,
> > > > + u64 mask_ctx1);
> > > > void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
> > > > int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch);
> > > > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > > > index d22fd1ccc0ba..bba9ae559f57 100644
> > > > --- a/drivers/gpu/drm/xe/xe_pt.c
> > > > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > > > @@ -1341,10 +1341,21 @@ static int xe_pt_vm_dependencies(struct xe_sched_job *job,
> > > > }
> > > > if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
> > > > + u64 mask_ctx0 = NO_MASK_DEP, mask_ctx1 = NO_MASK_DEP;
> > > > +
> > > > + if (ijob)
> > > > + mask_ctx0 = xe_tlb_inval_job_fence_context(ijob);
> > > > + if (mjob)
> > > > + mask_ctx1 = xe_tlb_inval_job_fence_context(mjob);
> > > > +
> > > > if (job)
> > > > - err = xe_sched_job_last_fence_add_dep(job, vm);
> > > > + err = xe_sched_job_last_fence_add_dep(job, vm,
> > > > + mask_ctx0,
> > > > + mask_ctx1);
> > > > else
> > > > - err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> > > > + err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q,
> > > > + vm, mask_ctx0,
> > > > + mask_ctx1);
> > > > }
> > > > for (i = 0; job && !err && i < vops->num_syncs; i++)
> > > > diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
> > > > index d21bf8f26964..7cbdd87904c6 100644
> > > > --- a/drivers/gpu/drm/xe/xe_sched_job.c
> > > > +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> > > > @@ -6,6 +6,7 @@
> > > > #include "xe_sched_job.h"
> > > > #include <uapi/drm/xe_drm.h>
> > > > +#include <linux/dma-fence-array.h>
> > > > #include <linux/dma-fence-chain.h>
> > > > #include <linux/slab.h>
> > > > @@ -295,19 +296,60 @@ void xe_sched_job_push(struct xe_sched_job *job)
> > > > xe_sched_job_put(job);
> > > > }
> > > > +/**
> > > > + * xe_sched_job_mask_dependency() - Determine if a dma-fence dependency can be masked
> > > > + * @fence: The dma-fence to check
> > > > + * @mask_ctx0: First context to compare against the fence's context
> > > > + * @mask_ctx1: Second context to compare against the fence's context
> > > > + *
> > > > + * This function checks whether the context of the given dma-fence matches
> > > > + * either of the provided mask contexts. If a match is found, the dependency
> > > > + * represented by the fence can be skipped. If the fence is a dma-fence-array,
> > > > + * its individual fences are unwound and checked.
> > > > + *
> > > > + * Return: true if the fence can be masked (i.e., skipped), false otherwise.
> > > > + */
> > > > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> > > > + u64 mask_ctx1)
> > > > +{
> > > > + if (dma_fence_is_array(fence)) {
> > > > + struct dma_fence *__fence;
> > > > + int index;
> > > > +
> > > > + dma_fence_array_for_each(__fence, index, fence)
> > > > + if (__fence->context == mask_ctx0 ||
> > > > + __fence->context == mask_ctx1)
> > > > + return true;
> > > > + } else if (fence->context == mask_ctx0 ||
> > > > + fence->context == mask_ctx1) {
> > > > + return true;
> > > > + }
> > > > +
> > > > + return false;
> > > > +}
> > > > +
> > > > /**
> > > > * xe_sched_job_last_fence_add_dep - Add last fence dependency to job
> > > > * @job:job to add the last fence dependency to
> > > > * @vm: virtual memory job belongs to
> > > > + * @mask_ctx0: Mask dma-fence context0
> > > > + * @mask_ctx1: Mask dma-fence context1
> > > > + *
> > > > + * Add last fence dependency to job, skipping masked dma fence contexts.
> > > > *
> > > > * Returns:
> > > > * 0 on success, or an error on failing to expand the array.
> > > > */
> > > > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm)
> > > > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> > > > + u64 mask_ctx0, u64 mask_ctx1)
> > > > {
> > > > struct dma_fence *fence;
> > > > fence = xe_exec_queue_last_fence_get(job->q, vm);
> > > > + if (xe_sched_job_mask_dependency(fence, mask_ctx0, mask_ctx1)) {
> > > > + dma_fence_put(fence);
> > > > + return 0;
> > > > + }
> > > > return drm_sched_job_add_dependency(&job->drm, fence);
> > > > }
> > > > diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h
> > > > index 3dc72c5c1f13..81d8e848e605 100644
> > > > --- a/drivers/gpu/drm/xe/xe_sched_job.h
> > > > +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> > > > @@ -58,7 +58,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job);
> > > > void xe_sched_job_arm(struct xe_sched_job *job);
> > > > void xe_sched_job_push(struct xe_sched_job *job);
> > > > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm);
> > > > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> > > > + u64 mask_ctx0, u64 mask_ctx1);
> > > > void xe_sched_job_init_user_fence(struct xe_sched_job *job,
> > > > struct xe_sync_entry *sync);
> > > > @@ -93,4 +94,8 @@ void xe_sched_job_snapshot_print(struct xe_sched_job_snapshot *snapshot, struct
> > > > int xe_sched_job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
> > > > enum dma_resv_usage usage);
> > > > +#define NO_MASK_DEP (~0x0ull)
> > > > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> > > > + u64 mask_ctx1);
> > > > +
> > > > #endif
> > > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > > index 492def04a559..f2fe7f9fbb22 100644
> > > > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > > @@ -32,6 +32,8 @@ struct xe_tlb_inval_job {
> > > > u64 start;
> > > > /** @end: End address to invalidate */
> > > > u64 end;
> > > > + /** @fence_context: Fence context for job */
> > > > + u64 fence_context;
> > > > /** @asid: Address space ID to invalidate */
> > > > u32 asid;
> > > > /** @fence_armed: Fence has been armed */
> > > > @@ -101,6 +103,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
> > > > job->asid = asid;
> > > > job->fence_armed = false;
> > > > job->dep.ops = &dep_job_ops;
> > > > + job->fence_context = entity->fence_context + 1;
> > >
> > > As a side note, hardcoding the assumption about how the scheduler
> > > allocates contexts is not great, given recent efforts to make drivers
> > > know less of the scheduler internals.
> > >
> >
> > Yes, we should probably have a helper here — maybe
> > drm_sched_job_finished_context?
> >
> > I was planning to roll this change into [1], but that series hasn’t
> > gained much traction, and fixing this is a fairly high-priority issue
> > for customers.
> >
> > This is documented in the DRM scheduler kernel docs:
> > entity->fence_context + 1 is the job's finished context.
> >
> > [1] https://patchwork.freedesktop.org/series/155314/
> >
> > > But what I really wanted to ask is, having only glanced at the patch
> > > briefly: could the xe performance problem here also be solved by
> > > unwrapping the container fences at the DRM scheduler dependency
> > > tracking level?
> > >
> >
> > This is primarily about preventing TLB fences — which originate from a
> > different context than the bind queue but are still ordered on the queue
> > — from becoming dependencies. The process involves two passes: in the
> > first pass, we detect dependencies. If none are found, we immediately
> > complete the bind via the CPU. If dependencies are present, we defer the
> > bind to the GPU.
>
> Interesting, I saw fence unwrapping and context number checking and
> thought it was maybe the same problem. I do not understand what xe is
> doing well enough to comment strongly, but it does raise the question
> of whether there could be a more elegant solution (i.e. not a hack).
>
> Could the two entities be shared and would that solve the problem? I mean
> the TLB invalidation and the bind queue entities, do they need to be
> separate if the assumption and guarantee is to execute in order?
>
Sharing dma-fence context would be great, but we have three scheduler
instances here — one for the bind queue and two for TLB invalidations,
one per GuC instance. The bind job feeds into the two TLB invalidations
as dependencies. The two TLB invalidations themselves are not ordered
with respect to each other, and the overall operation signals only when
both TLB invalidations have signaled.
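Schematically:

	            bind job (bind queue scheduler)
	               /                  \
	   TLB inval job (GuC0)    TLB inval job (GuC1)
	               \                  /
	     composite fence (signals when both invals signal)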
This gets more complicated when a subsequent bind is issued without an
invalidation — it needs to wait on the prior invalidations to ensure
that fences sent to user space from the queue don’t signal out of order.
If the subsequent bind does issue an invalidation, then we don’t need to
wait — and that’s what this patch is (partially) fixing (e.g., a burst
of unbinds, which is the issue you previously raised with Chrome
switching tabs).
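Roughly, for a burst of unbinds:

	unbind A (bind queue) -> TLB inval A
	unbind B (bind queue) -> TLB inval B

Without the masking, unbind B picks up the queue's last fence - which
includes TLB inval A's fence - as a dependency and serializes on it.
With the masking, that fence is skipped because its context matches the
invalidation context B will use anyway, and those invalidations are
ordered, so B can pipeline behind A.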
I’d love to find an elegant solution, but I’m just not seeing one right
now. I also wouldn’t call this a hack — getting dependency tracking
right in a complex driver is, frankly, just really hard. We’re still
working on getting everything correct.
Matt
> > > I am asking because amdgpu recently posted a patch to unwrap in their code
> > > for potentially similar performance reasons, and if now xe wants something
> > > similar, or even the same, it is an interesting question where to do it.
> > >
> > > Also, I have a patch (not sure if I posted it so far) which unwraps in
> > > drm_sched_job_add_dependency() and converts the dependency xarray to an
> > > unwrapped dma-fence-array. The initial idea there was to allow the
> > > scheduler worker to be woken up only once, once all deps are signaled,
> > > but now that two drivers seem to be unwrapping fences, maybe there is a
> > > case to be made for doing it in the core.
> > >
> >
> > I don't think this is the same problem as the one above, but it's an
> > interesting idea in general. CC me if you post this one.
>
> Okay, but since it sounds like it would not help here, cleaning it up
> and sending it will not be a priority, so it might be a while.
>
> Regards,
>
> Tvrtko
>
> >
> > Matt
> >
> > > Regards,
> > >
> > > Tvrtko
> > >
> > > > kref_init(&job->refcount);
> > > > xe_exec_queue_get(q); /* Pairs with put in xe_tlb_inval_job_destroy */
> > > > @@ -266,3 +269,14 @@ void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job)
> > > > if (!IS_ERR_OR_NULL(job))
> > > > kref_put(&job->refcount, xe_tlb_inval_job_destroy);
> > > > }
> > > > +
> > > > +/**
> > > > + * xe_tlb_inval_job_fence_context() - TLB invalidation job fence context
> > > > + * @job: TLB invalidation job object
> > > > + *
> > > > + * Return: TLB invalidation job fence context
> > > > + */
> > > > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job)
> > > > +{
> > > > + return job->fence_context;
> > > > +}
> > > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > > index e63edcb26b50..2576165c2228 100644
> > > > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > > @@ -30,4 +30,6 @@ void xe_tlb_inval_job_get(struct xe_tlb_inval_job *job);
> > > > void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job);
> > > > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job);
> > > > +
> > > > #endif
> > >
>
>
>
>
>
* Re: [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations
2025-10-23 18:55 ` Matthew Brost
@ 2025-10-23 19:27 ` Matthew Brost
0 siblings, 0 replies; 15+ messages in thread
From: Matthew Brost @ 2025-10-23 19:27 UTC (permalink / raw)
To: Tvrtko Ursulin; +Cc: intel-xe, carlos.santa, thomas.hellstrom, Philipp Stanner
On Thu, Oct 23, 2025 at 11:55:45AM -0700, Matthew Brost wrote:
> On Thu, Oct 23, 2025 at 01:46:26PM +0100, Tvrtko Ursulin wrote:
> >
> > On 22/10/2025 16:10, Matthew Brost wrote:
> > > On Wed, Oct 22, 2025 at 09:00:47AM +0100, Tvrtko Ursulin wrote:
> > > >
> > > > On 17/10/2025 17:52, Matthew Brost wrote:
> > > > > When a burst of unbind jobs is issued, a dependency chain can form
> > > > > between the TLB invalidation of a previous unbind job and the current
> > > > > one. This leads to undesirable serialization, causing current jobs to
> > > > > wait unnecessarily for prior TLB invalidations, execute on the GPU when
> > > > > not needed, and significantly slow down the unbind burst—resulting in up
> > > > > to a 4× slowdown.
> > > > >
> > > > > To break this chain, mask the last bind queue dependency if the last
> > > > > fence's DMA context matches the TLB invalidation context. This allows
> > > > > full pipelining of unbinds and TLB invalidations while preserving
> > > > > correct dma-fence signaling semantics.
> > > > >
> > > > > Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
> > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > ---
> > > > > drivers/gpu/drm/xe/xe_exec.c | 3 +-
> > > > > drivers/gpu/drm/xe/xe_exec_queue.c | 18 +++++++++--
> > > > > drivers/gpu/drm/xe/xe_exec_queue.h | 3 +-
> > > > > drivers/gpu/drm/xe/xe_pt.c | 15 +++++++--
> > > > > drivers/gpu/drm/xe/xe_sched_job.c | 44 ++++++++++++++++++++++++++-
> > > > > drivers/gpu/drm/xe/xe_sched_job.h | 7 ++++-
> > > > > drivers/gpu/drm/xe/xe_tlb_inval_job.c | 14 +++++++++
> > > > > drivers/gpu/drm/xe/xe_tlb_inval_job.h | 2 ++
> > > > > 8 files changed, 98 insertions(+), 8 deletions(-)
> > > > >
> > > > > diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> > > > > index 0dc27476832b..6034cfc8be06 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_exec.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > > > > @@ -294,7 +294,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> > > > > goto err_put_job;
> > > > > if (!xe_vm_in_lr_mode(vm)) {
> > > > > - err = xe_sched_job_last_fence_add_dep(job, vm);
> > > > > + err = xe_sched_job_last_fence_add_dep(job, vm, NO_MASK_DEP,
> > > > > + NO_MASK_DEP);
> > > > > if (err)
> > > > > goto err_put_job;
> > > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > index 90cbc95f8e2e..d6f69d9bccba 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > @@ -25,6 +25,7 @@
> > > > > #include "xe_migrate.h"
> > > > > #include "xe_pm.h"
> > > > > #include "xe_ring_ops_types.h"
> > > > > +#include "xe_sched_job.h"
> > > > > #include "xe_trace.h"
> > > > > #include "xe_vm.h"
> > > > > #include "xe_pxp.h"
> > > > > @@ -1106,11 +1107,17 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm,
> > > > > * xe_exec_queue_last_fence_test_dep - Test last fence dependency of queue
> > > > > * @q: The exec queue
> > > > > * @vm: The VM the engine does a bind or exec for
> > > > > + * @mask_ctx0: Mask dma-fence context0
> > > > > + * @mask_ctx1: Mask dma-fence context1
> > > > > + *
> > > > > + * Test last fence dependency of queue, skipping masked dma fence contexts.
> > > > > *
> > > > > * Returns:
> > > > > - * -ETIME if there exists an unsignalled last fence dependency, zero otherwise.
> > > > > + * -ETIME if there exists an unsignalled and unmasked last fence dependency,
> > > > > + * zero otherwise.
> > > > > */
> > > > > -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> > > > > +int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm,
> > > > > + u64 mask_ctx0, u64 mask_ctx1)
> > > > > {
> > > > > struct dma_fence *fence;
> > > > > int err = 0;
> > > > > @@ -1119,6 +1126,13 @@ int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> > > > > if (fence) {
> > > > > err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) ?
> > > > > 0 : -ETIME;
> > > > > +
> > > > > + if (err == -ETIME) {
> > > > > + if (xe_sched_job_mask_dependency(fence, mask_ctx0,
> > > > > + mask_ctx1))
> > > > > + err = 0;
> > > > > + }
> > > > > +
> > > > > dma_fence_put(fence);
> > > > > }
> > > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> > > > > index a4dfbe858bda..99a35b22a46c 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> > > > > @@ -85,7 +85,8 @@ struct dma_fence *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
> > > > > void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
> > > > > struct dma_fence *fence);
> > > > > int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> > > > > - struct xe_vm *vm);
> > > > > + struct xe_vm *vm, u64 mask_ctx0,
> > > > > + u64 mask_ctx1);
> > > > > void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
> > > > > int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch);
> > > > > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > > > > index d22fd1ccc0ba..bba9ae559f57 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_pt.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > > > > @@ -1341,10 +1341,21 @@ static int xe_pt_vm_dependencies(struct xe_sched_job *job,
> > > > > }
> > > > > if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
> > > > > + u64 mask_ctx0 = NO_MASK_DEP, mask_ctx1 = NO_MASK_DEP;
> > > > > +
> > > > > + if (ijob)
> > > > > + mask_ctx0 = xe_tlb_inval_job_fence_context(ijob);
> > > > > + if (mjob)
> > > > > + mask_ctx1 = xe_tlb_inval_job_fence_context(mjob);
> > > > > +
> > > > > if (job)
> > > > > - err = xe_sched_job_last_fence_add_dep(job, vm);
> > > > > + err = xe_sched_job_last_fence_add_dep(job, vm,
> > > > > + mask_ctx0,
> > > > > + mask_ctx1);
> > > > > else
> > > > > - err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> > > > > + err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q,
> > > > > + vm, mask_ctx0,
> > > > > + mask_ctx1);
> > > > > }
> > > > > for (i = 0; job && !err && i < vops->num_syncs; i++)
> > > > > diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
> > > > > index d21bf8f26964..7cbdd87904c6 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_sched_job.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> > > > > @@ -6,6 +6,7 @@
> > > > > #include "xe_sched_job.h"
> > > > > #include <uapi/drm/xe_drm.h>
> > > > > +#include <linux/dma-fence-array.h>
> > > > > #include <linux/dma-fence-chain.h>
> > > > > #include <linux/slab.h>
> > > > > @@ -295,19 +296,60 @@ void xe_sched_job_push(struct xe_sched_job *job)
> > > > > xe_sched_job_put(job);
> > > > > }
> > > > > +/**
> > > > > + * xe_sched_job_mask_dependency() - Determine if a dma-fence dependency can be masked
> > > > > + * @fence: The dma-fence to check
> > > > > + * @mask_ctx0: First context to compare against the fence's context
> > > > > + * @mask_ctx1: Second context to compare against the fence's context
> > > > > + *
> > > > > + * This function checks whether the context of the given dma-fence matches
> > > > > + * either of the provided mask contexts. If a match is found, the dependency
> > > > > + * represented by the fence can be skipped. If the fence is a dma-fence-array,
> > > > > + * its individual fences are unwound and checked.
> > > > > + *
> > > > > + * Return: true if the fence can be masked (i.e., skipped), false otherwise.
> > > > > + */
> > > > > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> > > > > + u64 mask_ctx1)
> > > > > +{
> > > > > + if (dma_fence_is_array(fence)) {
> > > > > + struct dma_fence *__fence;
> > > > > + int index;
> > > > > +
> > > > > + dma_fence_array_for_each(__fence, index, fence)
> > > > > + if (__fence->context == mask_ctx0 ||
> > > > > + __fence->context == mask_ctx1)
> > > > > + return true;
> > > > > + } else if (fence->context == mask_ctx0 ||
> > > > > + fence->context == mask_ctx1) {
> > > > > + return true;
> > > > > + }
> > > > > +
> > > > > + return false;
> > > > > +}
> > > > > +
> > > > > /**
> > > > > * xe_sched_job_last_fence_add_dep - Add last fence dependency to job
> > > > > * @job:job to add the last fence dependency to
> > > > > * @vm: virtual memory job belongs to
> > > > > + * @mask_ctx0: Mask dma-fence context0
> > > > > + * @mask_ctx1: Mask dma-fence context1
> > > > > + *
> > > > > + * Add last fence dependency to job, skipping masked dma fence contexts.
> > > > > *
> > > > > * Returns:
> > > > > * 0 on success, or an error on failing to expand the array.
> > > > > */
> > > > > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm)
> > > > > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> > > > > + u64 mask_ctx0, u64 mask_ctx1)
> > > > > {
> > > > > struct dma_fence *fence;
> > > > > fence = xe_exec_queue_last_fence_get(job->q, vm);
> > > > > + if (xe_sched_job_mask_dependency(fence, mask_ctx0, mask_ctx1)) {
> > > > > + dma_fence_put(fence);
> > > > > + return 0;
> > > > > + }
> > > > > return drm_sched_job_add_dependency(&job->drm, fence);
> > > > > }
> > > > > diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h
> > > > > index 3dc72c5c1f13..81d8e848e605 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_sched_job.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> > > > > @@ -58,7 +58,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job);
> > > > > void xe_sched_job_arm(struct xe_sched_job *job);
> > > > > void xe_sched_job_push(struct xe_sched_job *job);
> > > > > -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm);
> > > > > +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> > > > > + u64 mask_ctx0, u64 mask_ctx1);
> > > > > void xe_sched_job_init_user_fence(struct xe_sched_job *job,
> > > > > struct xe_sync_entry *sync);
> > > > > @@ -93,4 +94,8 @@ void xe_sched_job_snapshot_print(struct xe_sched_job_snapshot *snapshot, struct
> > > > > int xe_sched_job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
> > > > > enum dma_resv_usage usage);
> > > > > +#define NO_MASK_DEP (~0x0ull)
> > > > > +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> > > > > + u64 mask_ctx1);
> > > > > +
> > > > > #endif
> > > > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > > > index 492def04a559..f2fe7f9fbb22 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> > > > > @@ -32,6 +32,8 @@ struct xe_tlb_inval_job {
> > > > > u64 start;
> > > > > /** @end: End address to invalidate */
> > > > > u64 end;
> > > > > + /** @fence_context: Fence context for job */
> > > > > + u64 fence_context;
> > > > > /** @asid: Address space ID to invalidate */
> > > > > u32 asid;
> > > > > /** @fence_armed: Fence has been armed */
> > > > > @@ -101,6 +103,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
> > > > > job->asid = asid;
> > > > > job->fence_armed = false;
> > > > > job->dep.ops = &dep_job_ops;
> > > > > + job->fence_context = entity->fence_context + 1;
> > > >
> > > > As a side note, hardcoding the assumption on how scheduler allocates
> > > > contexts is not great given recent efforts to make drivers know less of the
> > > > scheduler internals.
> > > >
> > >
> > > Yes, we should probably have a helper here — maybe
> > > drm_sched_job_finished_context?
> > >
> > > I was planning to roll this change into [1], but that series hasn’t
> > > gained much traction, and fixing this is a fairly high-priority issue
> > > for customers.
> > >
> > > This is documented in the DRM scheduler kernel docs:
> > > entity->fence_context + 1 is the job's finished context.
> > >
> > > [1] https://patchwork.freedesktop.org/series/155314/
> > >
> > > > But what I really wanted to ask is, having only glanced the patch briefly,
> > > > could xe performance problem here also be solved by unwrapping the container
> > > > fences at the DRM scheduler dependency tracking level?
> > > >
> > >
> > > This is primarily about preventing TLB fences — which originate from a
> > > different context than the bind queue but are still ordered on the queue
> > > — from becoming dependencies. The process involves two passes: in the
> > > first pass, we detect dependencies. If none are found, we immediately
> > > complete the bind via the CPU. If dependencies are present, we defer the
> > > bind to the GPU.
> >
> > Interesting, I saw fence unwrapping and context number checking and thought
> > it was maybe the same problem. I do not fully understand what xe is doing to
> > comment strongly but it does raise a question on whether there could be a
> > more elegant solution (ie not a hack).
> >
> > Could the two entities be shared and would that solve the problem? I mean
> > the TLB invalidation and the bind queue entities, do they need to be
> > separate if the assumption and guarantee is to execute in order?
> >
>
> Sharing dma-fence context would be great, but we have three scheduler
> instances here — one for the bind queue and two for TLB invalidations,
> one per GuC instance. The bind job feeds into the two TLB invalidations
> as dependencies. The two TLB invalidations themselves are not ordered
> with respect to each other, and the overall operation signals only when
> both TLB invalidations have signaled.
>
> This gets more complicated when a subsequent bind is issued without an
> invalidation — it needs to wait on the prior invalidations to ensure
> that fences sent to user space from the queue don’t signal out of order.
>
> If the subsequent bind does issue an invalidation, then we don’t need to
> wait — and that’s what this patch is (partially) fixing (e.g., a burst
> of unbinds, which is the issue you previously raised with Chrome
> switching tabs).
>
> I’d love to find an elegant solution, but I’m just not seeing one right
Okay, I take this back. I think I have an elegant solution, but it will
take time for Xe to get there.

Currently, when a bind job is needed (i.e., it has dependencies), the
final part of the bind process runs on the GPU, programming leaves or
pruning parts of the page table structure. GPU jobs are inherently
asynchronous, so subsequent TLB invalidations must be scheduled to run
afterward.

I have patches that, for various reasons, make it advantageous to run
the bind job on the CPU. Once that happens, the operation becomes
synchronous, allowing TLB invalidations to be issued immediately from
run_job, returning a dma-fence-array or dma-fence-chain of all issued
TLB invalidations. We’d only need a single queue (scheduler instance /
entity) for bind jobs and invalidations.

We’d still need to block somewhere to cover the case of a job with an
invalidation followed by a job without an invalidation, but that seems
workable.
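
To make the single-queue idea concrete, here is a rough sketch of the
fence run_job could return once binds complete synchronously on the
CPU, assuming a dma-fence-chain over the issued invalidations. This is
a sketch under assumptions, not the planned implementation;
chain_invalidations() is a hypothetical helper, while
dma_fence_chain_alloc() and dma_fence_chain_init() are the real kernel
APIs.

	#include <linux/dma-fence-chain.h>
	#include <linux/err.h>

	/* Hypothetical helper: collapse the TLB invalidation fences
	 * issued synchronously from run_job into one chain fence that
	 * signals once every invalidation has completed. Returns NULL
	 * if count == 0.
	 */
	static struct dma_fence *
	chain_invalidations(struct dma_fence **inval_fences, int count)
	{
		struct dma_fence *prev = NULL;
		int i;

		for (i = 0; i < count; i++) {
			struct dma_fence_chain *chain;

			chain = dma_fence_chain_alloc();
			if (!chain) {
				dma_fence_put(prev);
				return ERR_PTR(-ENOMEM);
			}

			/* The chain node takes ownership of prev and of
			 * the invalidation fence reference.
			 */
			dma_fence_chain_init(chain, prev,
					     dma_fence_get(inval_fences[i]),
					     i + 1);
			prev = &chain->base;
		}

		return prev;
	}
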
As I said, this won’t happen overnight. The CPU bind patches have been
around for a couple of years, and the last time I revived them, PM
references were leaking—so I need to track that down. But in the long
term, a single queue for binds and invalidations seems like the ultimate
goal.

In the short term, this is something we need to fix—it's a clear issue
in our driver.
Matt
> now. I also wouldn’t call this a hack — getting dependency tracking
> right in a complex driver is, frankly, just really hard. We’re still
> working on getting everything correct.
>
> Matt
>
> > > > I am asking because amdgpu recently posted a patch to unwrap in their code
> > > > for potentially similar performance reasons, and if now xe wants something
> > > > similar, or even the same, it is an interesting question where to do it.
> > > >
> > > > Also, I have a patch (not sure if I posted it so far) which unwraps in
> > > > drm_sched_job_add_dependency() and converts the dependency xarray to
> > > > unwrapped dma-fence-array. Initial idea there was to allow the scheduler worker
> > > > to only be woken up once, once all deps are signaled, but now if two drivers
> > > > seem to be unwrapping fences, maybe there is a case to be made for doing it
> > > > in the core.
> > > >
> > >
> > > I don't think this is the same problem as the one above, but it's an
> > > interesting idea in general. CC me if you post this one.
> >
> > Okay, but since it sounds like it would not help here, it will not be a
> > priority to clean it up and send, so it might be a while.
> >
> > Regards,
> >
> > Tvrtko
> >
> > >
> > > Matt
> > >
> > > > Regards,
> > > >
> > > > Tvrtko
> > > >
> > > > > kref_init(&job->refcount);
> > > > > xe_exec_queue_get(q); /* Pairs with put in xe_tlb_inval_job_destroy */
> > > > > @@ -266,3 +269,14 @@ void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job)
> > > > > if (!IS_ERR_OR_NULL(job))
> > > > > kref_put(&job->refcount, xe_tlb_inval_job_destroy);
> > > > > }
> > > > > +
> > > > > +/**
> > > > > + * xe_tlb_inval_job_fence_context() - TLB invalidation job fence context
> > > > > + * @job: TLB invalidation job object
> > > > > + *
> > > > > + * Return: TLB invalidation job fence context
> > > > > + */
> > > > > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job)
> > > > > +{
> > > > > + return job->fence_context;
> > > > > +}
> > > > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > > > index e63edcb26b50..2576165c2228 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> > > > > @@ -30,4 +30,6 @@ void xe_tlb_inval_job_get(struct xe_tlb_inval_job *job);
> > > > > void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job);
> > > > > +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job);
> > > > > +
> > > > > #endif
> > > >
> >
> >
> >
> >
> >
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations
2025-10-17 16:52 ` [PATCH 1/1] drm/xe: Avoid serializing unbind jobs on prior TLB invalidations Matthew Brost
2025-10-21 17:55 ` Summers, Stuart
2025-10-22 8:00 ` Tvrtko Ursulin
@ 2025-10-23 12:28 ` Thomas Hellström
2 siblings, 0 replies; 15+ messages in thread
From: Thomas Hellström @ 2025-10-23 12:28 UTC (permalink / raw)
To: Matthew Brost, intel-xe; +Cc: carlos.santa
Hi Matt
On Fri, 2025-10-17 at 09:52 -0700, Matthew Brost wrote:
> When a burst of unbind jobs is issued, a dependency chain can form
> between the TLB invalidation of a previous unbind job and the current
> one. This leads to undesirable serialization, causing current jobs to
> wait unnecessarily for prior TLB invalidations, execute on the GPU when
> not needed, and significantly slow down the unbind burst—resulting in up
> to a 4× slowdown.
>
> To break this chain, mask the last bind queue dependency if the last
> fence's DMA context matches the TLB invalidation context. This allows
> full pipelining of unbinds and TLB invalidations while preserving
> correct dma-fence signaling semantics.
>
> Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Some comments below.

Apart from that, it looks like this stems from us always combining the
exec_queue fence and TLB fence into an out_fence and then using that as
the exec_queue_last_fence.

But IMO the exec_queue last fence should always be a single gpu_job
fence (except when there is no bind job), so that we store the gpu_job
fence as the last fence, and *then* combine it with any TLB fence as the
out fence.
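
A minimal sketch of that suggestion, assuming the split is workable:
track only the raw gpu_job fence as the queue's last fence, and build
the combined fence only for what user space sees.
xe_exec_queue_last_fence_set() is the existing xe helper, while
finish_bind() and combine_out_fence() are hypothetical stand-ins for
wherever the out-fence is actually built.

	/* Hedged sketch: the exec_queue last fence stays a single
	 * bind-queue fence, so later binds never inherit a TLB
	 * invalidation dependency through it; the combined fence is
	 * only handed out as the out_fence.
	 */
	static struct dma_fence *
	finish_bind(struct xe_exec_queue *q, struct xe_vm *vm,
		    struct dma_fence *gpu_job_fence,
		    struct dma_fence *tlb_fence)
	{
		/* Queue-internal ordering follows the bind job alone. */
		xe_exec_queue_last_fence_set(q, vm, gpu_job_fence);

		/* User space still waits for bind + invalidation. */
		return combine_out_fence(gpu_job_fence, tlb_fence);
	}
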
> ---
> drivers/gpu/drm/xe/xe_exec.c | 3 +-
> drivers/gpu/drm/xe/xe_exec_queue.c | 18 +++++++++--
> drivers/gpu/drm/xe/xe_exec_queue.h | 3 +-
> drivers/gpu/drm/xe/xe_pt.c | 15 +++++++--
> drivers/gpu/drm/xe/xe_sched_job.c | 44 ++++++++++++++++++++++++++-
> drivers/gpu/drm/xe/xe_sched_job.h | 7 ++++-
> drivers/gpu/drm/xe/xe_tlb_inval_job.c | 14 +++++++++
> drivers/gpu/drm/xe/xe_tlb_inval_job.h | 2 ++
> 8 files changed, 98 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> index 0dc27476832b..6034cfc8be06 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -294,7 +294,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> goto err_put_job;
>
> if (!xe_vm_in_lr_mode(vm)) {
> - err = xe_sched_job_last_fence_add_dep(job, vm);
> + err = xe_sched_job_last_fence_add_dep(job, vm, NO_MASK_DEP,
> + NO_MASK_DEP);
> if (err)
> goto err_put_job;
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 90cbc95f8e2e..d6f69d9bccba 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -25,6 +25,7 @@
> #include "xe_migrate.h"
> #include "xe_pm.h"
> #include "xe_ring_ops_types.h"
> +#include "xe_sched_job.h"
> #include "xe_trace.h"
> #include "xe_vm.h"
> #include "xe_pxp.h"
> @@ -1106,11 +1107,17 @@ void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm,
> * xe_exec_queue_last_fence_test_dep - Test last fence dependency of queue
> * @q: The exec queue
> * @vm: The VM the engine does a bind or exec for
> + * @mask_ctx0: Mask dma-fence context0
> + * @mask_ctx1: Mask dma-fence context1
> + *
> + * Test last fence dependency of queue, skipping masked dma fence contexts.
> *
> * Returns:
> - * -ETIME if there exists an unsignalled last fence dependency, zero otherwise.
> + * -ETIME if there exists an unsignalled and unmasked last fence dependency,
> + * zero otherwise.
> */
> -int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> +int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm,
> + u64 mask_ctx0, u64 mask_ctx1)
> {
> struct dma_fence *fence;
> int err = 0;
> @@ -1119,6 +1126,13 @@ int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q, struct xe_vm *vm)
> if (fence) {
> err = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) ?
> 0 : -ETIME;
> +
> + if (err == -ETIME) {
> + if (xe_sched_job_mask_dependency(fence, mask_ctx0,
> + mask_ctx1))
> + err = 0;
> + }
> +
> dma_fence_put(fence);
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> index a4dfbe858bda..99a35b22a46c 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> @@ -85,7 +85,8 @@ struct dma_fence *xe_exec_queue_last_fence_get_for_resume(struct xe_exec_queue *
> void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
> struct dma_fence *fence);
> int xe_exec_queue_last_fence_test_dep(struct xe_exec_queue *q,
> - struct xe_vm *vm);
> + struct xe_vm *vm, u64 mask_ctx0,
> + u64 mask_ctx1);
> void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
>
> int xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch);
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index d22fd1ccc0ba..bba9ae559f57 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -1341,10 +1341,21 @@ static int xe_pt_vm_dependencies(struct xe_sched_job *job,
> }
>
> if (!(pt_update_ops->q->flags & EXEC_QUEUE_FLAG_KERNEL)) {
> + u64 mask_ctx0 = NO_MASK_DEP, mask_ctx1 = NO_MASK_DEP;
> +
> + if (ijob)
> + mask_ctx0 = xe_tlb_inval_job_fence_context(ijob);
> + if (mjob)
> + mask_ctx1 = xe_tlb_inval_job_fence_context(mjob);
> +
> if (job)
> - err = xe_sched_job_last_fence_add_dep(job, vm);
> + err = xe_sched_job_last_fence_add_dep(job, vm,
> + mask_ctx0,
> + mask_ctx1);
> else
> - err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q, vm);
> + err = xe_exec_queue_last_fence_test_dep(pt_update_ops->q,
> + vm, mask_ctx0,
> + mask_ctx1);
> }
>
> for (i = 0; job && !err && i < vops->num_syncs; i++)
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
> index d21bf8f26964..7cbdd87904c6 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.c
> +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> @@ -6,6 +6,7 @@
> #include "xe_sched_job.h"
>
> #include <uapi/drm/xe_drm.h>
> +#include <linux/dma-fence-array.h>
> #include <linux/dma-fence-chain.h>
> #include <linux/slab.h>
>
> @@ -295,19 +296,60 @@ void xe_sched_job_push(struct xe_sched_job *job)
> xe_sched_job_put(job);
> }
>
> +/**
> + * xe_sched_job_mask_dependency() - Determine if a dma-fence dependency can be masked
> + * @fence: The dma-fence to check
> + * @mask_ctx0: First context to compare against the fence's context
> + * @mask_ctx1: Second context to compare against the fence's context
> + *
> + * This function checks whether the context of the given dma-fence matches
> + * either of the provided mask contexts. If a match is found, the dependency
> + * represented by the fence can be skipped. If the fence is a dma-fence-array,
> + * its individual fences are unwound and checked.
> + *
> + * Return: true if the fence can be masked (i.e., skipped), false otherwise.
> + */
> +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> + u64 mask_ctx1)
> +{
> + if (dma_fence_is_array(fence)) {
> + struct dma_fence *__fence;
> + int index;
> +
> + dma_fence_array_for_each(__fence, index, fence)
> + if (__fence->context == mask_ctx0 ||
> + __fence->context == mask_ctx1)
> + return true;
What if there are other fences in the array that don't match the
contexts? Don't we lose them?
On a different side-note, looking at the code, it seems the last fence
could in theory be an array_fence[tiles] of array_fence[gts]. I don't
think we have such HW yet, but IIRC the dma_fence container rules do
not allow that kind of nesting.
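
On both points, one possible direction, offered here only as a hedged
sketch: the dma-fence-unwrap iterator walks every leaf fence through
chains and arrays, and requiring all leaves to match (rather than any
one) would avoid losing non-matching fences in a container. Whether
that is the intended semantic for this patch is an open question;
all_leaves_maskable() is a hypothetical helper, while
dma_fence_unwrap_for_each() is the real kernel iterator.

	#include <linux/dma-fence-unwrap.h>

	/* Sketch: only treat the dependency as maskable if every leaf
	 * fence belongs to one of the two maskable contexts, so
	 * non-matching fences in a container are never dropped.
	 */
	static bool all_leaves_maskable(struct dma_fence *fence,
					u64 mask_ctx0, u64 mask_ctx1)
	{
		struct dma_fence_unwrap iter;
		struct dma_fence *f;

		dma_fence_unwrap_for_each(f, &iter, fence)
			if (f->context != mask_ctx0 &&
			    f->context != mask_ctx1)
				return false;

		return true;
	}
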
Thanks,
Thomas
> + } else if (fence->context == mask_ctx0 ||
> + fence->context == mask_ctx1) {
> + return true;
> + }
> +
> + return false;
> +}
> +
> /**
> * xe_sched_job_last_fence_add_dep - Add last fence dependency to job
> * @job:job to add the last fence dependency to
> * @vm: virtual memory job belongs to
> + * @mask_ctx0: Mask dma-fence context0
> + * @mask_ctx1: Mask dma-fence context1
> + *
> + * Add last fence dependency to job, skipping masked dma fence contexts.
> *
> * Returns:
> * 0 on success, or an error on failing to expand the array.
> */
> -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm)
> +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> + u64 mask_ctx0, u64 mask_ctx1)
> {
> struct dma_fence *fence;
>
> fence = xe_exec_queue_last_fence_get(job->q, vm);
> + if (xe_sched_job_mask_dependency(fence, mask_ctx0, mask_ctx1)) {
> + dma_fence_put(fence);
> + return 0;
> + }
>
> return drm_sched_job_add_dependency(&job->drm, fence);
> }
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h
> index 3dc72c5c1f13..81d8e848e605 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.h
> +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> @@ -58,7 +58,8 @@ bool xe_sched_job_completed(struct xe_sched_job *job);
> void xe_sched_job_arm(struct xe_sched_job *job);
> void xe_sched_job_push(struct xe_sched_job *job);
>
> -int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm);
> +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm,
> + u64 mask_ctx0, u64 mask_ctx1);
> void xe_sched_job_init_user_fence(struct xe_sched_job *job,
> struct xe_sync_entry *sync);
>
> @@ -93,4 +94,8 @@ void xe_sched_job_snapshot_print(struct xe_sched_job_snapshot *snapshot, struct
> int xe_sched_job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
> enum dma_resv_usage usage);
>
> +#define NO_MASK_DEP (~0x0ull)
> +bool xe_sched_job_mask_dependency(struct dma_fence *fence, u64 mask_ctx0,
> + u64 mask_ctx1);
> +
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.c b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> index 492def04a559..f2fe7f9fbb22 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.c
> @@ -32,6 +32,8 @@ struct xe_tlb_inval_job {
> u64 start;
> /** @end: End address to invalidate */
> u64 end;
> + /** @fence_context: Fence context for job */
> + u64 fence_context;
> /** @asid: Address space ID to invalidate */
> u32 asid;
> /** @fence_armed: Fence has been armed */
> @@ -101,6 +103,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
> job->asid = asid;
> job->fence_armed = false;
> job->dep.ops = &dep_job_ops;
> + job->fence_context = entity->fence_context + 1;
> kref_init(&job->refcount);
> xe_exec_queue_get(q); /* Pairs with put in xe_tlb_inval_job_destroy */
>
> @@ -266,3 +269,14 @@ void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job)
> if (!IS_ERR_OR_NULL(job))
> kref_put(&job->refcount, xe_tlb_inval_job_destroy);
> }
> +
> +/**
> + * xe_tlb_inval_job_fence_context() - TLB invalidation job fence context
> + * @job: TLB invalidation job object
> + *
> + * Return: TLB invalidation job fence context
> + */
> +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job)
> +{
> + return job->fence_context;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_job.h b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> index e63edcb26b50..2576165c2228 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_job.h
> @@ -30,4 +30,6 @@ void xe_tlb_inval_job_get(struct xe_tlb_inval_job *job);
>
> void xe_tlb_inval_job_put(struct xe_tlb_inval_job *job);
>
> +u64 xe_tlb_inval_job_fence_context(struct xe_tlb_inval_job *job);
> +
> #endif
^ permalink raw reply [flat|nested] 15+ messages in thread