* [PATCH v3 1/4] drm/xe: Use ring ops TLB invalidation for rebinds
[not found] <20240327091136.3271-1-thomas.hellstrom@linux.intel.com>
@ 2024-03-27 9:11 ` Thomas Hellström
2024-03-27 9:11 ` [PATCH v3 2/4] drm/xe: Rework rebinding Thomas Hellström
2024-03-27 9:11 ` [PATCH v3 3/4] drm/xe: Make TLB invalidation fences unordered Thomas Hellström
2 siblings, 0 replies; 3+ messages in thread
From: Thomas Hellström @ 2024-03-27 9:11 UTC (permalink / raw)
To: intel-xe; +Cc: Thomas Hellström, Matthew Brost, stable
For each rebind we insert a GuC TLB invalidation and add a
corresponding unordered TLB invalidation fence. This might
add a huge number of TLB invalidation fences to wait for so
rather than doing that, defer the TLB invalidation to the
next ring ops for each affected exec queue. Since the TLB
is invalidated on exec_queue switch, we need to invalidate
once for each affected exec_queue.
v2:
- Simplify if-statements around the tlb_flush_seqno.
(Matthew Brost)
- Add some comments and asserts.
Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs")
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: <stable@vger.kernel.org> # v6.8+
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_exec_queue_types.h | 5 +++++
drivers/gpu/drm/xe/xe_pt.c | 6 ++++--
drivers/gpu/drm/xe/xe_ring_ops.c | 11 ++++-------
drivers/gpu/drm/xe/xe_sched_job.c | 10 ++++++++++
drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++
drivers/gpu/drm/xe/xe_vm_types.h | 5 +++++
6 files changed, 30 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 9cc689f50db0..ee78d497d838 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -146,6 +146,11 @@ struct xe_exec_queue {
const struct xe_ring_ops *ring_ops;
/** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */
struct drm_sched_entity *entity;
+ /**
+ * @tlb_flush_seqno: The seqno of the last rebind tlb flush performed
+ * Protected by @vm's resv. Unused if @vm == NULL.
+ */
+ u64 tlb_flush_seqno;
/** @lrc: logical ring context for this exec queue */
struct xe_lrc lrc[];
};
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index 8d3922d2206e..37117752cfc9 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -1254,11 +1254,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
* non-faulting LR, in particular on user-space batch buffer chaining,
* it needs to be done here.
*/
- if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) ||
- (!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
+ if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
if (!ifence)
return ERR_PTR(-ENOMEM);
+ } else if (rebind && !xe_vm_in_lr_mode(vm)) {
+ /* We bump also if batch_invalidate_tlb is true */
+ vm->tlb_flush_seqno++;
}
rfence = kzalloc(sizeof(*rfence), GFP_KERNEL);
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index c4edffcd4a32..5b2b37b59813 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -219,10 +219,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc
{
u32 dw[MAX_JOB_SIZE_DW], i = 0;
u32 ppgtt_flag = get_ppgtt_flag(job);
- struct xe_vm *vm = job->q->vm;
struct xe_gt *gt = job->q->gt;
- if (vm && vm->batch_invalidate_tlb) {
+ if (job->ring_ops_flush_tlb) {
dw[i++] = preparser_disable(true);
i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
seqno, true, dw, i);
@@ -270,7 +269,6 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
struct xe_gt *gt = job->q->gt;
struct xe_device *xe = gt_to_xe(gt);
bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
- struct xe_vm *vm = job->q->vm;
dw[i++] = preparser_disable(true);
@@ -282,13 +280,13 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
}
- if (vm && vm->batch_invalidate_tlb)
+ if (job->ring_ops_flush_tlb)
i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
seqno, true, dw, i);
dw[i++] = preparser_disable(false);
- if (!vm || !vm->batch_invalidate_tlb)
+ if (!job->ring_ops_flush_tlb)
i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
seqno, dw, i);
@@ -317,7 +315,6 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
struct xe_gt *gt = job->q->gt;
struct xe_device *xe = gt_to_xe(gt);
bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
- struct xe_vm *vm = job->q->vm;
u32 mask_flags = 0;
dw[i++] = preparser_disable(true);
@@ -327,7 +324,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;
/* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
- i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i);
+ i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i);
/* hsdes: 1809175790 */
if (has_aux_ccs(xe))
diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
index add5a8b89be8..80daee910ae9 100644
--- a/drivers/gpu/drm/xe/xe_sched_job.c
+++ b/drivers/gpu/drm/xe/xe_sched_job.c
@@ -252,6 +252,16 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
void xe_sched_job_arm(struct xe_sched_job *job)
{
+ struct xe_exec_queue *q = job->q;
+ struct xe_vm *vm = q->vm;
+
+ if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) &&
+ (vm->batch_invalidate_tlb || vm->tlb_flush_seqno != q->tlb_flush_seqno)) {
+ xe_vm_assert_held(vm);
+ q->tlb_flush_seqno = vm->tlb_flush_seqno;
+ job->ring_ops_flush_tlb = true;
+ }
+
drm_sched_job_arm(&job->drm);
}
diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
index b1d83da50a53..5e12724219fd 100644
--- a/drivers/gpu/drm/xe/xe_sched_job_types.h
+++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
@@ -39,6 +39,8 @@ struct xe_sched_job {
} user_fence;
/** @migrate_flush_flags: Additional flush flags for migration jobs */
u32 migrate_flush_flags;
+ /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */
+ bool ring_ops_flush_tlb;
/** @batch_addr: batch buffer address of job */
u64 batch_addr[];
};
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index ae5fb565f6bf..5747f136d24d 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -264,6 +264,11 @@ struct xe_vm {
bool capture_once;
} error_capture;
+ /**
+ * @tlb_flush_seqno: Required TLB flush seqno for the next exec.
+ * protected by the vm resv.
+ */
+ u64 tlb_flush_seqno;
/** @batch_invalidate_tlb: Always invalidate TLB before batch start */
bool batch_invalidate_tlb;
/** @xef: XE file handle for tracking this VM's drm client */
--
2.44.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH v3 2/4] drm/xe: Rework rebinding
[not found] <20240327091136.3271-1-thomas.hellstrom@linux.intel.com>
2024-03-27 9:11 ` [PATCH v3 1/4] drm/xe: Use ring ops TLB invalidation for rebinds Thomas Hellström
@ 2024-03-27 9:11 ` Thomas Hellström
2024-03-27 9:11 ` [PATCH v3 3/4] drm/xe: Make TLB invalidation fences unordered Thomas Hellström
2 siblings, 0 replies; 3+ messages in thread
From: Thomas Hellström @ 2024-03-27 9:11 UTC (permalink / raw)
To: intel-xe; +Cc: Thomas Hellström, Rodrigo Vivi, Matthew Brost, stable
Instead of handling the vm's rebind fence separately,
which is error prone if they are not strictly ordered,
attach rebind fences as kernel fences to the vm's resv.
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: <stable@vger.kernel.org> # v6.8+
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_exec.c | 31 +++----------------------------
drivers/gpu/drm/xe/xe_pt.c | 2 +-
drivers/gpu/drm/xe/xe_vm.c | 27 +++++++++------------------
drivers/gpu/drm/xe/xe_vm.h | 2 +-
drivers/gpu/drm/xe/xe_vm_types.h | 3 ---
5 files changed, 14 insertions(+), 51 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 9d53ef8c49cc..b7e26e8e6472 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -152,7 +152,6 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
struct drm_exec *exec = &vm_exec.exec;
u32 i, num_syncs = 0, num_ufence = 0;
struct xe_sched_job *job;
- struct dma_fence *rebind_fence;
struct xe_vm *vm;
bool write_locked, skip_retry = false;
ktime_t end = 0;
@@ -294,35 +293,11 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
* Rebind any invalidated userptr or evicted BOs in the VM, non-compute
* VM mode only.
*/
- rebind_fence = xe_vm_rebind(vm, false);
- if (IS_ERR(rebind_fence)) {
- err = PTR_ERR(rebind_fence);
+ err = xe_vm_rebind(vm, false);
+ if (err)
goto err_put_job;
- }
-
- /*
- * We store the rebind_fence in the VM so subsequent execs don't get
- * scheduled before the rebinds of userptrs / evicted BOs is complete.
- */
- if (rebind_fence) {
- dma_fence_put(vm->rebind_fence);
- vm->rebind_fence = rebind_fence;
- }
- if (vm->rebind_fence) {
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
- &vm->rebind_fence->flags)) {
- dma_fence_put(vm->rebind_fence);
- vm->rebind_fence = NULL;
- } else {
- dma_fence_get(vm->rebind_fence);
- err = drm_sched_job_add_dependency(&job->drm,
- vm->rebind_fence);
- if (err)
- goto err_put_job;
- }
- }
- /* Wait behind munmap style rebinds */
+ /* Wait behind rebinds */
if (!xe_vm_in_lr_mode(vm)) {
err = drm_sched_job_add_resv_dependencies(&job->drm,
xe_vm_resv(vm),
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index 37117752cfc9..632c1919471d 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -1299,7 +1299,7 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
}
/* add shared fence now for pagetable delayed destroy */
- dma_resv_add_fence(xe_vm_resv(vm), fence, !rebind &&
+ dma_resv_add_fence(xe_vm_resv(vm), fence, rebind ||
last_munmap_rebind ?
DMA_RESV_USAGE_KERNEL :
DMA_RESV_USAGE_BOOKKEEP);
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 694fbb546372..e995196862db 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -522,7 +522,6 @@ static void preempt_rebind_work_func(struct work_struct *w)
{
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
struct drm_exec exec;
- struct dma_fence *rebind_fence;
unsigned int fence_count = 0;
LIST_HEAD(preempt_fences);
ktime_t end = 0;
@@ -568,18 +567,11 @@ static void preempt_rebind_work_func(struct work_struct *w)
if (err)
goto out_unlock;
- rebind_fence = xe_vm_rebind(vm, true);
- if (IS_ERR(rebind_fence)) {
- err = PTR_ERR(rebind_fence);
+ err = xe_vm_rebind(vm, true);
+ if (err)
goto out_unlock;
- }
-
- if (rebind_fence) {
- dma_fence_wait(rebind_fence, false);
- dma_fence_put(rebind_fence);
- }
- /* Wait on munmap style VM unbinds */
+ /* Wait on rebinds and munmap style VM unbinds */
wait = dma_resv_wait_timeout(xe_vm_resv(vm),
DMA_RESV_USAGE_KERNEL,
false, MAX_SCHEDULE_TIMEOUT);
@@ -777,14 +769,14 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
struct xe_sync_entry *syncs, u32 num_syncs,
bool first_op, bool last_op);
-struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
+int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
{
- struct dma_fence *fence = NULL;
+ struct dma_fence *fence;
struct xe_vma *vma, *next;
lockdep_assert_held(&vm->lock);
if (xe_vm_in_lr_mode(vm) && !rebind_worker)
- return NULL;
+ return 0;
xe_vm_assert_held(vm);
list_for_each_entry_safe(vma, next, &vm->rebind_list,
@@ -792,17 +784,17 @@ struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
xe_assert(vm->xe, vma->tile_present);
list_del_init(&vma->combined_links.rebind);
- dma_fence_put(fence);
if (rebind_worker)
trace_xe_vma_rebind_worker(vma);
else
trace_xe_vma_rebind_exec(vma);
fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
if (IS_ERR(fence))
- return fence;
+ return PTR_ERR(fence);
+ dma_fence_put(fence);
}
- return fence;
+ return 0;
}
static void xe_vma_free(struct xe_vma *vma)
@@ -1592,7 +1584,6 @@ static void vm_destroy_work_func(struct work_struct *w)
XE_WARN_ON(vm->pt_root[id]);
trace_xe_vm_free(vm);
- dma_fence_put(vm->rebind_fence);
kfree(vm);
}
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index 6df1f1c7f85d..4853354336f2 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -207,7 +207,7 @@ int __xe_vm_userptr_needs_repin(struct xe_vm *vm);
int xe_vm_userptr_check_repin(struct xe_vm *vm);
-struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker);
+int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker);
int xe_vm_invalidate_vma(struct xe_vma *vma);
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 5747f136d24d..badf3945083d 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -177,9 +177,6 @@ struct xe_vm {
*/
struct list_head rebind_list;
- /** @rebind_fence: rebind fence from execbuf */
- struct dma_fence *rebind_fence;
-
/**
* @destroy_work: worker to destroy VM, needed as a dma_fence signaling
* from an irq context can be last put and the destroy needs to be able
--
2.44.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH v3 3/4] drm/xe: Make TLB invalidation fences unordered
[not found] <20240327091136.3271-1-thomas.hellstrom@linux.intel.com>
2024-03-27 9:11 ` [PATCH v3 1/4] drm/xe: Use ring ops TLB invalidation for rebinds Thomas Hellström
2024-03-27 9:11 ` [PATCH v3 2/4] drm/xe: Rework rebinding Thomas Hellström
@ 2024-03-27 9:11 ` Thomas Hellström
2 siblings, 0 replies; 3+ messages in thread
From: Thomas Hellström @ 2024-03-27 9:11 UTC (permalink / raw)
To: intel-xe; +Cc: Thomas Hellström, Matthew Brost, stable
They can actually complete out-of-order, so allocate a unique
fence context for each fence.
Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs")
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: <stable@vger.kernel.org> # v6.8+
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 1 -
drivers/gpu/drm/xe/xe_gt_types.h | 7 -------
drivers/gpu/drm/xe/xe_pt.c | 3 +--
3 files changed, 1 insertion(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
index 25b4111097bc..93df2d7969b3 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
@@ -63,7 +63,6 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
INIT_LIST_HEAD(>->tlb_invalidation.pending_fences);
spin_lock_init(>->tlb_invalidation.pending_lock);
spin_lock_init(>->tlb_invalidation.lock);
- gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1);
INIT_DELAYED_WORK(>->tlb_invalidation.fence_tdr,
xe_gt_tlb_fence_timeout);
diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
index f6da2ad9719f..2143dffcaf11 100644
--- a/drivers/gpu/drm/xe/xe_gt_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_types.h
@@ -179,13 +179,6 @@ struct xe_gt {
* xe_gt_tlb_fence_timeout after the timeut interval is over.
*/
struct delayed_work fence_tdr;
- /** @tlb_invalidation.fence_context: context for TLB invalidation fences */
- u64 fence_context;
- /**
- * @tlb_invalidation.fence_seqno: seqno to TLB invalidation fences, protected by
- * tlb_invalidation.lock
- */
- u32 fence_seqno;
/** @tlb_invalidation.lock: protects TLB invalidation fences */
spinlock_t lock;
} tlb_invalidation;
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index 632c1919471d..d1b999dbc906 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -1135,8 +1135,7 @@ static int invalidation_fence_init(struct xe_gt *gt,
spin_lock_irq(>->tlb_invalidation.lock);
dma_fence_init(&ifence->base.base, &invalidation_fence_ops,
>->tlb_invalidation.lock,
- gt->tlb_invalidation.fence_context,
- ++gt->tlb_invalidation.fence_seqno);
+ dma_fence_context_alloc(1), 1);
spin_unlock_irq(>->tlb_invalidation.lock);
INIT_LIST_HEAD(&ifence->base.link);
--
2.44.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2024-03-27 9:11 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <20240327091136.3271-1-thomas.hellstrom@linux.intel.com>
2024-03-27 9:11 ` [PATCH v3 1/4] drm/xe: Use ring ops TLB invalidation for rebinds Thomas Hellström
2024-03-27 9:11 ` [PATCH v3 2/4] drm/xe: Rework rebinding Thomas Hellström
2024-03-27 9:11 ` [PATCH v3 3/4] drm/xe: Make TLB invalidation fences unordered Thomas Hellström
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).