* [PATCH v2 0/5] drm/xe: Allow migrate vm gpu submissions from reclaim context
@ 2024-05-23 15:29 Thomas Hellström
2024-05-23 15:29 ` [PATCH v2 1/5] drm/xe: Decouple job seqno and lrc seqno Thomas Hellström
` (5 more replies)
0 siblings, 6 replies; 10+ messages in thread
From: Thomas Hellström @ 2024-05-23 15:29 UTC (permalink / raw)
To: intel-xe; +Cc: Thomas Hellström, Matthew Brost
On Lunar Lake, we need to be able to copy CCS compression metadata from
reclaim context when using a shrinker. Since we do dma_fence allocations
under the struct xe_migrate::job_mutex, we're facing a lockdep violation.
This patchset deals with moving the job + fence allocation out of the
job_mutex by splitting the fence allocation and initialization into
separate phases.
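The gist of the violation, as a hypothetical sketch (not the actual
driver code; the variable names are illustrative): a GFP_KERNEL
allocation made while holding a mutex that is also taken from the
reclaim / shrinker path can recurse into reclaim, which then needs the
same mutex:

	/* Shrinker path: */
	mutex_lock(&m->job_mutex);
	/* ... copy CCS metadata ... */
	mutex_unlock(&m->job_mutex);

	/* Submission path, before this series: */
	mutex_lock(&m->job_mutex);
	/* May enter reclaim, which again wants job_mutex -> deadlock. */
	fence = kmalloc(sizeof(*fence), GFP_KERNEL);
	mutex_unlock(&m->job_mutex);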
v2:
- Add a patch from Matthew Brost to fix up the use of
composite fence seqno.
- A couple of bugfixes as detailed in patch 3.
Matthew Brost (1):
drm/xe: Decouple job seqno and lrc seqno
Thomas Hellström (4):
drm/xe: Split lrc seqno fence creation up
drm/xe: Don't initialize fences at xe_sched_job_create()
drm/xe: Remove xe_lrc_create_seqno_fence()
drm/xe: Move job creation out of the struct xe_migrate::job_mutex
drivers/gpu/drm/xe/xe_exec_queue.c | 5 -
drivers/gpu/drm/xe/xe_exec_queue_types.h | 10 --
drivers/gpu/drm/xe/xe_guc_submit.c | 5 +-
drivers/gpu/drm/xe/xe_hw_fence.c | 59 ++++++--
drivers/gpu/drm/xe/xe_hw_fence.h | 7 +-
drivers/gpu/drm/xe/xe_lrc.c | 39 +++++-
drivers/gpu/drm/xe/xe_lrc.h | 4 +-
drivers/gpu/drm/xe/xe_migrate.c | 17 ++-
drivers/gpu/drm/xe/xe_ring_ops.c | 22 +--
drivers/gpu/drm/xe/xe_sched_job.c | 165 +++++++++++++----------
drivers/gpu/drm/xe/xe_sched_job.h | 5 +
drivers/gpu/drm/xe/xe_sched_job_types.h | 20 ++-
drivers/gpu/drm/xe/xe_trace.h | 9 +-
13 files changed, 235 insertions(+), 132 deletions(-)
--
2.44.0
* [PATCH v2 1/5] drm/xe: Decouple job seqno and lrc seqno
2024-05-23 15:29 [PATCH v2 0/5] drm/xe: Allow migrate vm gpu submissions from reclaim context Thomas Hellström
@ 2024-05-23 15:29 ` Thomas Hellström
2024-05-23 15:29 ` [PATCH v2 2/5] drm/xe: Split lrc seqno fence creation up Thomas Hellström
` (4 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Thomas Hellström @ 2024-05-23 15:29 UTC (permalink / raw)
To: intel-xe; +Cc: Matthew Brost, Thomas Hellström
From: Matthew Brost <matthew.brost@intel.com>
Tightly coupling these seqnos presents problems if alternative fences
for jobs are used. Decouple them for correctness.
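To illustrate the distinction after this patch (a minimal sketch using
the accessors introduced below):

	/* Seqno of the fence the scheduler tracks (may be a composite): */
	u32 fence_seqno = xe_sched_job_seqno(job);  /* job->fence->seqno */

	/* Seqno emitted to / compared against the LRC by the ring ops: */
	u32 hw_seqno = xe_sched_job_lrc_seqno(job); /* job->lrc_seqno */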
v2:
- Slightly reword commit message (Thomas)
- Make sure the lrc fence ops are used in comparison (Thomas)
- Assume seqno is unsigned rather than signed in format string (Thomas)
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
drivers/gpu/drm/xe/xe_exec_queue.c | 2 +-
drivers/gpu/drm/xe/xe_guc_submit.c | 5 +++--
drivers/gpu/drm/xe/xe_ring_ops.c | 12 ++++++------
drivers/gpu/drm/xe/xe_sched_job.c | 16 ++++++++--------
drivers/gpu/drm/xe/xe_sched_job.h | 5 +++++
drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++
drivers/gpu/drm/xe/xe_trace.h | 7 +++++--
7 files changed, 30 insertions(+), 19 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index 395de93579fa..e4607f0e3456 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -98,7 +98,7 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe,
if (xe_exec_queue_is_parallel(q)) {
q->parallel.composite_fence_ctx = dma_fence_context_alloc(1);
- q->parallel.composite_fence_seqno = XE_FENCE_INITIAL_SEQNO;
+ q->parallel.composite_fence_seqno = 0;
}
return q;
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 4efb88e3e056..5c64850641c2 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -973,8 +973,9 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
return DRM_GPU_SCHED_STAT_NOMINAL;
}
- drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx",
- xe_sched_job_seqno(job), q->guc->id, q->flags);
+ drm_notice(&xe->drm, "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
+ xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
+ q->guc->id, q->flags);
xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
"Kernel-submitted job timed out\n");
xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index a3ca718456f6..2705d1f9d572 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -398,7 +398,7 @@ static void emit_job_gen12_gsc(struct xe_sched_job *job)
__emit_job_gen12_simple(job, job->q->lrc,
job->batch_addr[0],
- xe_sched_job_seqno(job));
+ xe_sched_job_lrc_seqno(job));
}
static void emit_job_gen12_copy(struct xe_sched_job *job)
@@ -407,14 +407,14 @@ static void emit_job_gen12_copy(struct xe_sched_job *job)
if (xe_sched_job_is_migration(job->q)) {
emit_migration_job_gen12(job, job->q->lrc,
- xe_sched_job_seqno(job));
+ xe_sched_job_lrc_seqno(job));
return;
}
for (i = 0; i < job->q->width; ++i)
__emit_job_gen12_simple(job, job->q->lrc + i,
- job->batch_addr[i],
- xe_sched_job_seqno(job));
+ job->batch_addr[i],
+ xe_sched_job_lrc_seqno(job));
}
static void emit_job_gen12_video(struct xe_sched_job *job)
@@ -425,7 +425,7 @@ static void emit_job_gen12_video(struct xe_sched_job *job)
for (i = 0; i < job->q->width; ++i)
__emit_job_gen12_video(job, job->q->lrc + i,
job->batch_addr[i],
- xe_sched_job_seqno(job));
+ xe_sched_job_lrc_seqno(job));
}
static void emit_job_gen12_render_compute(struct xe_sched_job *job)
@@ -435,7 +435,7 @@ static void emit_job_gen12_render_compute(struct xe_sched_job *job)
for (i = 0; i < job->q->width; ++i)
__emit_job_gen12_render_compute(job, job->q->lrc + i,
job->batch_addr[i],
- xe_sched_job_seqno(job));
+ xe_sched_job_lrc_seqno(job));
}
static const struct xe_ring_ops ring_ops_gen12_gsc = {
diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
index cd8a2fba5438..8dd612b8b2b0 100644
--- a/drivers/gpu/drm/xe/xe_sched_job.c
+++ b/drivers/gpu/drm/xe/xe_sched_job.c
@@ -117,6 +117,7 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
err = PTR_ERR(job->fence);
goto err_sched_job;
}
+ job->lrc_seqno = job->fence->seqno;
} else {
struct dma_fence_array *cf;
@@ -132,6 +133,8 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
err = PTR_ERR(fences[j]);
goto err_fences;
}
+ if (!j)
+ job->lrc_seqno = fences[0]->seqno;
}
cf = dma_fence_array_create(q->width, fences,
@@ -144,10 +147,6 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
goto err_fences;
}
- /* Sanity check */
- for (j = 0; j < q->width; ++j)
- xe_assert(job_to_xe(job), cf->base.seqno == fences[j]->seqno);
-
job->fence = &cf->base;
}
@@ -233,9 +232,9 @@ bool xe_sched_job_started(struct xe_sched_job *job)
{
struct xe_lrc *lrc = job->q->lrc;
- return !__dma_fence_is_later(xe_sched_job_seqno(job),
+ return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
xe_lrc_start_seqno(lrc),
- job->fence->ops);
+ dma_fence_array_first(job->fence)->ops);
}
bool xe_sched_job_completed(struct xe_sched_job *job)
@@ -247,8 +246,9 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
* parallel handshake is done.
*/
- return !__dma_fence_is_later(xe_sched_job_seqno(job), xe_lrc_seqno(lrc),
- job->fence->ops);
+ return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
+ xe_lrc_seqno(lrc),
+ dma_fence_array_first(job->fence)->ops);
}
void xe_sched_job_arm(struct xe_sched_job *job)
diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h
index c75018f4660d..002c3b5c0a5c 100644
--- a/drivers/gpu/drm/xe/xe_sched_job.h
+++ b/drivers/gpu/drm/xe/xe_sched_job.h
@@ -73,6 +73,11 @@ static inline u32 xe_sched_job_seqno(struct xe_sched_job *job)
return job->fence->seqno;
}
+static inline u32 xe_sched_job_lrc_seqno(struct xe_sched_job *job)
+{
+ return job->lrc_seqno;
+}
+
static inline void
xe_sched_job_add_migrate_flush(struct xe_sched_job *job, u32 flags)
{
diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
index 5e12724219fd..990ddac55ed6 100644
--- a/drivers/gpu/drm/xe/xe_sched_job_types.h
+++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
@@ -37,6 +37,8 @@ struct xe_sched_job {
/** @user_fence.value: write back value */
u64 value;
} user_fence;
+ /** @lrc_seqno: LRC seqno */
+ u32 lrc_seqno;
/** @migrate_flush_flags: Additional flush flags for migration jobs */
u32 migrate_flush_flags;
/** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */
diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
index 2d56cfc09e42..6c6cecc58f63 100644
--- a/drivers/gpu/drm/xe/xe_trace.h
+++ b/drivers/gpu/drm/xe/xe_trace.h
@@ -254,6 +254,7 @@ DECLARE_EVENT_CLASS(xe_sched_job,
TP_STRUCT__entry(
__field(u32, seqno)
+ __field(u32, lrc_seqno)
__field(u16, guc_id)
__field(u32, guc_state)
__field(u32, flags)
@@ -264,6 +265,7 @@ DECLARE_EVENT_CLASS(xe_sched_job,
TP_fast_assign(
__entry->seqno = xe_sched_job_seqno(job);
+ __entry->lrc_seqno = xe_sched_job_lrc_seqno(job);
__entry->guc_id = job->q->guc->id;
__entry->guc_state =
atomic_read(&job->q->guc->state);
@@ -273,8 +275,9 @@ DECLARE_EVENT_CLASS(xe_sched_job,
__entry->batch_addr = (u64)job->batch_addr[0];
),
- TP_printk("fence=%p, seqno=%u, guc_id=%d, batch_addr=0x%012llx, guc_state=0x%x, flags=0x%x, error=%d",
- __entry->fence, __entry->seqno, __entry->guc_id,
+ TP_printk("fence=%p, seqno=%u, lrc_seqno=%u, guc_id=%d, batch_addr=0x%012llx, guc_state=0x%x, flags=0x%x, error=%d",
+ __entry->fence, __entry->seqno,
+ __entry->lrc_seqno, __entry->guc_id,
__entry->batch_addr, __entry->guc_state,
__entry->flags, __entry->error)
);
--
2.44.0
* [PATCH v2 2/5] drm/xe: Split lrc seqno fence creation up
2024-05-23 15:29 [PATCH v2 0/5] drm/xe: Allow migrate vm gpu submissions from reclaim context Thomas Hellström
2024-05-23 15:29 ` [PATCH v2 1/5] drm/xe: Decouple job seqno and lrc seqno Thomas Hellström
@ 2024-05-23 15:29 ` Thomas Hellström
2024-05-23 17:58 ` Matthew Brost
2024-05-23 15:29 ` [PATCH v2 3/5] drm/xe: Don't initialize fences at xe_sched_job_create() Thomas Hellström
` (3 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Thomas Hellström @ 2024-05-23 15:29 UTC (permalink / raw)
To: intel-xe; +Cc: Thomas Hellström, Matthew Brost
Since sometimes a lock is required to initialize a seqno fence,
and it might be desirable not to hold that lock while performing
memory allocations, split the lrc seqno fence creation up into an
allocation phase and an initialization phase.
Since lrc seqno fences are hw_fences under the hood, do the same
for those and remove the now-unused xe_hw_fence_create() function.
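A minimal usage sketch of the resulting split (the lock below is
hypothetical; any lock that must be held while the seqno is assigned
works the same way):

	struct dma_fence *fence = xe_lrc_alloc_seqno_fence();

	if (IS_ERR(fence))
		return PTR_ERR(fence);

	/* The allocation above may sleep; only now take the lock. */
	mutex_lock(&submit_lock);		/* hypothetical lock */
	xe_lrc_init_seqno_fence(lrc, fence);	/* assigns the seqno */
	mutex_unlock(&submit_lock);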
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
drivers/gpu/drm/xe/xe_hw_fence.c | 59 +++++++++++++++++++++++++-------
drivers/gpu/drm/xe/xe_hw_fence.h | 7 ++--
drivers/gpu/drm/xe/xe_lrc.c | 48 ++++++++++++++++++++++++--
drivers/gpu/drm/xe/xe_lrc.h | 3 ++
4 files changed, 101 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_hw_fence.c b/drivers/gpu/drm/xe/xe_hw_fence.c
index f872ef103127..35c0063a831a 100644
--- a/drivers/gpu/drm/xe/xe_hw_fence.c
+++ b/drivers/gpu/drm/xe/xe_hw_fence.c
@@ -208,23 +208,58 @@ static struct xe_hw_fence *to_xe_hw_fence(struct dma_fence *fence)
return container_of(fence, struct xe_hw_fence, dma);
}
-struct xe_hw_fence *xe_hw_fence_create(struct xe_hw_fence_ctx *ctx,
- struct iosys_map seqno_map)
+/**
+ * xe_hw_fence_alloc() - Allocate an hw fence.
+ *
+ * Allocate but don't initialize an hw fence.
+ *
+ * Return: Pointer to the allocated fence or
+ * negative error pointer on error.
+ */
+struct dma_fence *xe_hw_fence_alloc(void)
{
- struct xe_hw_fence *fence;
+ struct xe_hw_fence *hw_fence = fence_alloc();
- fence = fence_alloc();
- if (!fence)
+ if (!hw_fence)
return ERR_PTR(-ENOMEM);
- fence->ctx = ctx;
- fence->seqno_map = seqno_map;
- INIT_LIST_HEAD(&fence->irq_link);
+ return &hw_fence->dma;
+}
- dma_fence_init(&fence->dma, &xe_hw_fence_ops, &ctx->irq->lock,
- ctx->dma_fence_ctx, ctx->next_seqno++);
+/**
+ * xe_hw_fence_free() - Free an hw fence.
+ * @fence: Pointer to the fence to free.
+ *
+ * Frees an hw fence that hasn't yet been
+ * initialized.
+ */
+void xe_hw_fence_free(struct dma_fence *fence)
+{
+ fence_free(&fence->rcu);
+}
- trace_xe_hw_fence_create(fence);
+/**
+ * xe_hw_fence_init() - Initialize an hw fence.
+ * @fence: Pointer to the fence to initialize.
+ * @ctx: Pointer to the struct xe_hw_fence_ctx fence context.
+ * @seqno_map: Pointer to the map into where the seqno is blitted.
+ *
+ * Initializes a pre-allocated hw fence.
+ * After initialization, the fence is subject to normal
+ * dma-fence refcounting.
+ */
+void xe_hw_fence_init(struct dma_fence *fence, struct xe_hw_fence_ctx *ctx,
+ struct iosys_map seqno_map)
+{
+ struct xe_hw_fence *hw_fence =
+ container_of(fence, typeof(*hw_fence), dma);
+
+ hw_fence->ctx = ctx;
+ hw_fence->seqno_map = seqno_map;
+ INIT_LIST_HEAD(&hw_fence->irq_link);
+
+ dma_fence_init(fence, &xe_hw_fence_ops, &ctx->irq->lock,
+ ctx->dma_fence_ctx, ctx->next_seqno++);
- return fence;
+ trace_xe_hw_fence_create(hw_fence);
}
diff --git a/drivers/gpu/drm/xe/xe_hw_fence.h b/drivers/gpu/drm/xe/xe_hw_fence.h
index cfe5fd603787..f13a1c4982c7 100644
--- a/drivers/gpu/drm/xe/xe_hw_fence.h
+++ b/drivers/gpu/drm/xe/xe_hw_fence.h
@@ -24,7 +24,10 @@ void xe_hw_fence_ctx_init(struct xe_hw_fence_ctx *ctx, struct xe_gt *gt,
struct xe_hw_fence_irq *irq, const char *name);
void xe_hw_fence_ctx_finish(struct xe_hw_fence_ctx *ctx);
-struct xe_hw_fence *xe_hw_fence_create(struct xe_hw_fence_ctx *ctx,
- struct iosys_map seqno_map);
+struct dma_fence *xe_hw_fence_alloc(void);
+void xe_hw_fence_free(struct dma_fence *fence);
+
+void xe_hw_fence_init(struct dma_fence *fence, struct xe_hw_fence_ctx *ctx,
+ struct iosys_map seqno_map);
#endif
diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
index 9b0a4078add3..7441230ad627 100644
--- a/drivers/gpu/drm/xe/xe_lrc.c
+++ b/drivers/gpu/drm/xe/xe_lrc.c
@@ -1030,10 +1030,54 @@ u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
return __xe_lrc_seqno_ggtt_addr(lrc);
}
+/**
+ * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
+ *
+ * Allocate but don't initialize an lrc seqno fence.
+ *
+ * Return: Pointer to the allocated fence or
+ * negative error pointer on error.
+ */
+struct dma_fence *xe_lrc_alloc_seqno_fence(void)
+{
+ return xe_hw_fence_alloc();
+}
+
+/**
+ * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
+ * @fence: Pointer to the fence to free.
+ *
+ * Frees an lrc seqno fence that hasn't yet been
+ * initialized.
+ */
+void xe_lrc_free_seqno_fence(struct dma_fence *fence)
+{
+ xe_hw_fence_free(fence);
+}
+
+/**
+ * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
+ * @lrc: Pointer to the lrc.
+ * @fence: Pointer to the fence to initialize.
+ *
+ * Initializes a pre-allocated lrc seqno fence.
+ * After initialization, the fence is subject to normal
+ * dma-fence refcounting.
+ */
+void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
+{
+ xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
+}
+
struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
{
- return &xe_hw_fence_create(&lrc->fence_ctx,
- __xe_lrc_seqno_map(lrc))->dma;
+ struct dma_fence *fence = xe_lrc_alloc_seqno_fence();
+
+ if (IS_ERR(fence))
+ return fence;
+
+ xe_lrc_init_seqno_fence(lrc, fence);
+ return fence;
}
s32 xe_lrc_seqno(struct xe_lrc *lrc)
diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
index e0e841963c23..84e4f4ef7f68 100644
--- a/drivers/gpu/drm/xe/xe_lrc.h
+++ b/drivers/gpu/drm/xe/xe_lrc.h
@@ -44,6 +44,9 @@ void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val);
u64 xe_lrc_descriptor(struct xe_lrc *lrc);
u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc);
+struct dma_fence *xe_lrc_alloc_seqno_fence(void);
+void xe_lrc_free_seqno_fence(struct dma_fence *fence);
+void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence);
struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc);
s32 xe_lrc_seqno(struct xe_lrc *lrc);
--
2.44.0
* [PATCH v2 3/5] drm/xe: Don't initialize fences at xe_sched_job_create()
2024-05-23 15:29 [PATCH v2 0/5] drm/xe: Allow migrate vm gpu submissions from reclaim context Thomas Hellström
2024-05-23 15:29 ` [PATCH v2 1/5] drm/xe: Decouple job seqno and lrc seqno Thomas Hellström
2024-05-23 15:29 ` [PATCH v2 2/5] drm/xe: Split lrc seqno fence creation up Thomas Hellström
@ 2024-05-23 15:29 ` Thomas Hellström
2024-05-23 18:16 ` Matthew Brost
2024-05-23 15:29 ` [PATCH v2 4/5] drm/xe: Remove xe_lrc_create_seqno_fence() Thomas Hellström
` (2 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Thomas Hellström @ 2024-05-23 15:29 UTC (permalink / raw)
To: intel-xe; +Cc: Thomas Hellström, Matthew Brost
Pre-allocate but don't initialize fences at xe_sched_job_create(),
and initialize / arm them instead at xe_sched_job_arm(). This
makes it possible to move xe_sched_job_create() with its memory
allocation out of any lock that is required for fence
initialization, and that may not allow memory allocation under it.
Replace the struct dma_fence_array for parallel jobs with a
struct dma_fence_chain, since the former doesn't allow
splitting allocation and initialization.
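The dma-fence-chain API already supports such a split natively; a
sketch of how the two halves are used, mirroring what this patch does
in xe_sched_job_create() and xe_sched_job_arm():

	struct dma_fence_chain *chain;

	chain = dma_fence_chain_alloc();	/* may sleep, no lock held */
	if (!chain)
		return -ENOMEM;

	/* Later, under the submission lock: link @prev and @fence. */
	dma_fence_chain_init(chain, prev, fence, seqno);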
v2:
- Rebase.
- Don't always use the first lrc when initializing parallel
lrc fences.
- Use dma_fence_chain_contained() to access the lrc fences.
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
drivers/gpu/drm/xe/xe_exec_queue.c | 5 -
drivers/gpu/drm/xe/xe_exec_queue_types.h | 10 --
drivers/gpu/drm/xe/xe_ring_ops.c | 12 +-
drivers/gpu/drm/xe/xe_sched_job.c | 159 +++++++++++++----------
drivers/gpu/drm/xe/xe_sched_job_types.h | 18 ++-
drivers/gpu/drm/xe/xe_trace.h | 2 +-
6 files changed, 113 insertions(+), 93 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index e4607f0e3456..a5969271a964 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -96,11 +96,6 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe,
}
}
- if (xe_exec_queue_is_parallel(q)) {
- q->parallel.composite_fence_ctx = dma_fence_context_alloc(1);
- q->parallel.composite_fence_seqno = 0;
- }
-
return q;
}
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index ee78d497d838..f0c40e8ad80a 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -103,16 +103,6 @@ struct xe_exec_queue {
struct xe_guc_exec_queue *guc;
};
- /**
- * @parallel: parallel submission state
- */
- struct {
- /** @parallel.composite_fence_ctx: context composite fence */
- u64 composite_fence_ctx;
- /** @parallel.composite_fence_seqno: seqno for composite fence */
- u32 composite_fence_seqno;
- } parallel;
-
/** @sched_props: scheduling properties */
struct {
/** @sched_props.timeslice_us: timeslice period in micro-seconds */
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index 2705d1f9d572..f75756e7a87b 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -366,7 +366,7 @@ static void emit_migration_job_gen12(struct xe_sched_job *job,
dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE; /* Enabled again below */
- i = emit_bb_start(job->batch_addr[0], BIT(8), dw, i);
+ i = emit_bb_start(job->ptrs[0].batch_addr, BIT(8), dw, i);
if (!IS_SRIOV_VF(gt_to_xe(job->q->gt))) {
/* XXX: Do we need this? Leaving for now. */
@@ -375,7 +375,7 @@ static void emit_migration_job_gen12(struct xe_sched_job *job,
dw[i++] = preparser_disable(false);
}
- i = emit_bb_start(job->batch_addr[1], BIT(8), dw, i);
+ i = emit_bb_start(job->ptrs[1].batch_addr, BIT(8), dw, i);
dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | job->migrate_flush_flags |
MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW;
@@ -397,7 +397,7 @@ static void emit_job_gen12_gsc(struct xe_sched_job *job)
xe_gt_assert(gt, job->q->width <= 1); /* no parallel submission for GSCCS */
__emit_job_gen12_simple(job, job->q->lrc,
- job->batch_addr[0],
+ job->ptrs[0].batch_addr,
xe_sched_job_lrc_seqno(job));
}
@@ -413,7 +413,7 @@ static void emit_job_gen12_copy(struct xe_sched_job *job)
for (i = 0; i < job->q->width; ++i)
__emit_job_gen12_simple(job, job->q->lrc + i,
- job->batch_addr[i],
+ job->ptrs[i].batch_addr,
xe_sched_job_lrc_seqno(job));
}
@@ -424,7 +424,7 @@ static void emit_job_gen12_video(struct xe_sched_job *job)
/* FIXME: Not doing parallel handshake for now */
for (i = 0; i < job->q->width; ++i)
__emit_job_gen12_video(job, job->q->lrc + i,
- job->batch_addr[i],
+ job->ptrs[i].batch_addr,
xe_sched_job_lrc_seqno(job));
}
@@ -434,7 +434,7 @@ static void emit_job_gen12_render_compute(struct xe_sched_job *job)
for (i = 0; i < job->q->width; ++i)
__emit_job_gen12_render_compute(job, job->q->lrc + i,
- job->batch_addr[i],
+ job->ptrs[i].batch_addr,
xe_sched_job_lrc_seqno(job));
}
diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
index 8dd612b8b2b0..95c6c0411592 100644
--- a/drivers/gpu/drm/xe/xe_sched_job.c
+++ b/drivers/gpu/drm/xe/xe_sched_job.c
@@ -6,7 +6,7 @@
#include "xe_sched_job.h"
#include <drm/xe_drm.h>
-#include <linux/dma-fence-array.h>
+#include <linux/dma-fence-chain.h>
#include <linux/slab.h>
#include "xe_device.h"
@@ -29,7 +29,7 @@ int __init xe_sched_job_module_init(void)
xe_sched_job_slab =
kmem_cache_create("xe_sched_job",
sizeof(struct xe_sched_job) +
- sizeof(u64), 0,
+ sizeof(struct xe_job_ptrs), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!xe_sched_job_slab)
return -ENOMEM;
@@ -37,7 +37,7 @@ int __init xe_sched_job_module_init(void)
xe_sched_job_parallel_slab =
kmem_cache_create("xe_sched_job_parallel",
sizeof(struct xe_sched_job) +
- sizeof(u64) *
+ sizeof(struct xe_job_ptrs) *
XE_HW_ENGINE_MAX_INSTANCE, 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!xe_sched_job_parallel_slab) {
@@ -79,26 +79,33 @@ static struct xe_device *job_to_xe(struct xe_sched_job *job)
return gt_to_xe(job->q->gt);
}
+/* Free unused pre-allocated fences */
+static void xe_sched_job_free_fences(struct xe_sched_job *job)
+{
+ int i;
+
+ for (i = 0; i < job->q->width; ++i) {
+ struct xe_job_ptrs *ptrs = &job->ptrs[i];
+
+ if (ptrs->lrc_fence)
+ xe_lrc_free_seqno_fence(ptrs->lrc_fence);
+ if (ptrs->chain_fence)
+ dma_fence_chain_free(ptrs->chain_fence);
+ }
+}
+
struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
u64 *batch_addr)
{
- struct xe_sched_job *job;
- struct dma_fence **fences;
bool is_migration = xe_sched_job_is_migration(q);
+ struct xe_sched_job *job;
int err;
- int i, j;
+ int i;
u32 width;
/* only a kernel context can submit a vm-less job */
XE_WARN_ON(!q->vm && !(q->flags & EXEC_QUEUE_FLAG_KERNEL));
- /* Migration and kernel engines have their own locking */
- if (!(q->flags & (EXEC_QUEUE_FLAG_KERNEL | EXEC_QUEUE_FLAG_VM))) {
- lockdep_assert_held(&q->vm->lock);
- if (!xe_vm_in_lr_mode(q->vm))
- xe_vm_assert_held(q->vm);
- }
-
job = job_alloc(xe_exec_queue_is_parallel(q) || is_migration);
if (!job)
return ERR_PTR(-ENOMEM);
@@ -111,43 +118,25 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
if (err)
goto err_free;
- if (!xe_exec_queue_is_parallel(q)) {
- job->fence = xe_lrc_create_seqno_fence(q->lrc);
- if (IS_ERR(job->fence)) {
- err = PTR_ERR(job->fence);
- goto err_sched_job;
- }
- job->lrc_seqno = job->fence->seqno;
- } else {
- struct dma_fence_array *cf;
+ for (i = 0; i < q->width; ++i) {
+ struct dma_fence *fence = xe_lrc_alloc_seqno_fence();
+ struct dma_fence_chain *chain;
- fences = kmalloc_array(q->width, sizeof(*fences), GFP_KERNEL);
- if (!fences) {
- err = -ENOMEM;
+ if (IS_ERR(fence)) {
+ err = PTR_ERR(fence);
goto err_sched_job;
}
+ job->ptrs[i].lrc_fence = fence;
- for (j = 0; j < q->width; ++j) {
- fences[j] = xe_lrc_create_seqno_fence(q->lrc + j);
- if (IS_ERR(fences[j])) {
- err = PTR_ERR(fences[j]);
- goto err_fences;
- }
- if (!j)
- job->lrc_seqno = fences[0]->seqno;
- }
+ if (i + 1 == q->width)
+ continue;
- cf = dma_fence_array_create(q->width, fences,
- q->parallel.composite_fence_ctx,
- q->parallel.composite_fence_seqno++,
- false);
- if (!cf) {
- --q->parallel.composite_fence_seqno;
+ chain = dma_fence_chain_alloc();
+ if (!chain) {
err = -ENOMEM;
- goto err_fences;
+ goto err_sched_job;
}
-
- job->fence = &cf->base;
+ job->ptrs[i].chain_fence = chain;
}
width = q->width;
@@ -155,7 +144,7 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
width = 2;
for (i = 0; i < width; ++i)
- job->batch_addr[i] = batch_addr[i];
+ job->ptrs[i].batch_addr = batch_addr[i];
/* All other jobs require a VM to be open which has a ref */
if (unlikely(q->flags & EXEC_QUEUE_FLAG_KERNEL))
@@ -165,13 +154,8 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
trace_xe_sched_job_create(job);
return job;
-err_fences:
- for (j = j - 1; j >= 0; --j) {
- --q->lrc[j].fence_ctx.next_seqno;
- dma_fence_put(fences[j]);
- }
- kfree(fences);
err_sched_job:
+ xe_sched_job_free_fences(job);
drm_sched_job_cleanup(&job->drm);
err_free:
xe_exec_queue_put(q);
@@ -193,33 +177,39 @@ void xe_sched_job_destroy(struct kref *ref)
if (unlikely(job->q->flags & EXEC_QUEUE_FLAG_KERNEL))
xe_pm_runtime_put(job_to_xe(job));
+ xe_sched_job_free_fences(job);
xe_exec_queue_put(job->q);
dma_fence_put(job->fence);
drm_sched_job_cleanup(&job->drm);
job_free(job);
}
-void xe_sched_job_set_error(struct xe_sched_job *job, int error)
+/* Set the error status under the fence lock to avoid racing with signaling */
+static bool xe_fence_set_error(struct dma_fence *fence, int error)
{
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags))
- return;
+ unsigned long irq_flags;
+ bool signaled;
- dma_fence_set_error(job->fence, error);
+ spin_lock_irqsave(fence->lock, irq_flags);
+ signaled = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags);
+ if (!signaled)
+ dma_fence_set_error(fence, error);
+ spin_unlock_irqrestore(fence->lock, irq_flags);
+
+ return signaled;
+}
- if (dma_fence_is_array(job->fence)) {
- struct dma_fence_array *array =
- to_dma_fence_array(job->fence);
- struct dma_fence **child = array->fences;
- unsigned int nchild = array->num_fences;
+void xe_sched_job_set_error(struct xe_sched_job *job, int error)
+{
+ if (xe_fence_set_error(job->fence, error))
+ return;
- do {
- struct dma_fence *current_fence = *child++;
+ if (dma_fence_is_chain(job->fence)) {
+ struct dma_fence *iter;
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
- &current_fence->flags))
- continue;
- dma_fence_set_error(current_fence, error);
- } while (--nchild);
+ dma_fence_chain_for_each(iter, job->fence)
+ xe_fence_set_error(dma_fence_chain_contained(iter),
+ error);
}
trace_xe_sched_job_set_error(job);
@@ -234,7 +224,7 @@ bool xe_sched_job_started(struct xe_sched_job *job)
return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
xe_lrc_start_seqno(lrc),
- dma_fence_array_first(job->fence)->ops);
+ dma_fence_chain_contained(job->fence)->ops);
}
bool xe_sched_job_completed(struct xe_sched_job *job)
@@ -248,13 +238,24 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
xe_lrc_seqno(lrc),
- dma_fence_array_first(job->fence)->ops);
+ dma_fence_chain_contained(job->fence)->ops);
}
void xe_sched_job_arm(struct xe_sched_job *job)
{
struct xe_exec_queue *q = job->q;
+ struct dma_fence *fence, *prev;
struct xe_vm *vm = q->vm;
+ u64 seqno = 0;
+ int i;
+
+ /* Migration and kernel engines have their own locking */
+ if (IS_ENABLED(CONFIG_LOCKDEP) &&
+ !(q->flags & (EXEC_QUEUE_FLAG_KERNEL | EXEC_QUEUE_FLAG_VM))) {
+ lockdep_assert_held(&q->vm->lock);
+ if (!xe_vm_in_lr_mode(q->vm))
+ xe_vm_assert_held(q->vm);
+ }
if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) &&
(vm->batch_invalidate_tlb || vm->tlb_flush_seqno != q->tlb_flush_seqno)) {
@@ -263,6 +264,25 @@ void xe_sched_job_arm(struct xe_sched_job *job)
job->ring_ops_flush_tlb = true;
}
+ /* Arm the pre-allocated fences */
+ for (i = 0; i < q->width; prev = fence, ++i) {
+ struct dma_fence_chain *chain;
+
+ fence = job->ptrs[i].lrc_fence;
+ xe_lrc_init_seqno_fence(&q->lrc[i], fence);
+ job->ptrs[i].lrc_fence = NULL;
+ if (!i) {
+ job->lrc_seqno = fence->seqno;
+ continue;
+ }
+
+ chain = job->ptrs[i - 1].chain_fence;
+ dma_fence_chain_init(chain, prev, fence, seqno++);
+ job->ptrs[i - 1].chain_fence = NULL;
+ fence = &chain->base;
+ }
+
+ job->fence = fence;
drm_sched_job_arm(&job->drm);
}
@@ -322,7 +342,8 @@ xe_sched_job_snapshot_capture(struct xe_sched_job *job)
snapshot->batch_addr_len = q->width;
for (i = 0; i < q->width; i++)
- snapshot->batch_addr[i] = xe_device_uncanonicalize_addr(xe, job->batch_addr[i]);
+ snapshot->batch_addr[i] =
+ xe_device_uncanonicalize_addr(xe, job->ptrs[i].batch_addr);
return snapshot;
}
diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
index 990ddac55ed6..0d3f76fb05ce 100644
--- a/drivers/gpu/drm/xe/xe_sched_job_types.h
+++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
@@ -11,6 +11,20 @@
#include <drm/gpu_scheduler.h>
struct xe_exec_queue;
+struct dma_fence;
+struct dma_fence_chain;
+
+/**
+ * struct xe_job_ptrs - Per hw engine instance data
+ */
+struct xe_job_ptrs {
+ /** @lrc_fence: Pre-allocated uninitialized lrc fence. */
+ struct dma_fence *lrc_fence;
+ /** @chain_fence: Pre-allocated uninitialized fence chain node. */
+ struct dma_fence_chain *chain_fence;
+ /** @batch_addr: Batch buffer address. */
+ u64 batch_addr;
+};
/**
* struct xe_sched_job - XE schedule job (batch buffer tracking)
@@ -43,8 +57,8 @@ struct xe_sched_job {
u32 migrate_flush_flags;
/** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */
bool ring_ops_flush_tlb;
- /** @batch_addr: batch buffer address of job */
- u64 batch_addr[];
+ /** @ptrs: per instance pointers. */
+ struct xe_job_ptrs ptrs[];
};
struct xe_sched_job_snapshot {
diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
index 6c6cecc58f63..450f407c66e8 100644
--- a/drivers/gpu/drm/xe/xe_trace.h
+++ b/drivers/gpu/drm/xe/xe_trace.h
@@ -272,7 +272,7 @@ DECLARE_EVENT_CLASS(xe_sched_job,
__entry->flags = job->q->flags;
__entry->error = job->fence->error;
__entry->fence = job->fence;
- __entry->batch_addr = (u64)job->batch_addr[0];
+ __entry->batch_addr = (u64)job->ptrs[0].batch_addr;
),
TP_printk("fence=%p, seqno=%u, lrc_seqno=%u, guc_id=%d, batch_addr=0x%012llx, guc_state=0x%x, flags=0x%x, error=%d",
--
2.44.0
* [PATCH v2 4/5] drm/xe: Remove xe_lrc_create_seqno_fence()
2024-05-23 15:29 [PATCH v2 0/5] drm/xe: Allow migrate vm gpu submissions from reclaim context Thomas Hellström
` (2 preceding siblings ...)
2024-05-23 15:29 ` [PATCH v2 3/5] drm/xe: Don't initialize fences at xe_sched_job_create() Thomas Hellström
@ 2024-05-23 15:29 ` Thomas Hellström
2024-05-24 6:56 ` Matthew Brost
2024-05-23 15:29 ` [PATCH v2 5/5] drm/xe: Move job creation out of the struct xe_migrate::job_mutex Thomas Hellström
2024-05-23 17:14 ` ✗ CI.Patch_applied: failure for drm/xe: Allow migrate vm gpu submissions from reclaim context (rev2) Patchwork
5 siblings, 1 reply; 10+ messages in thread
From: Thomas Hellström @ 2024-05-23 15:29 UTC (permalink / raw)
To: intel-xe; +Cc: Thomas Hellström, Matthew Brost
It's not used anymore.
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
drivers/gpu/drm/xe/xe_lrc.c | 11 -----------
drivers/gpu/drm/xe/xe_lrc.h | 1 -
2 files changed, 12 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
index 7441230ad627..0c7d8dad1cb1 100644
--- a/drivers/gpu/drm/xe/xe_lrc.c
+++ b/drivers/gpu/drm/xe/xe_lrc.c
@@ -1069,17 +1069,6 @@ void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}
-struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
-{
- struct dma_fence *fence = xe_lrc_alloc_seqno_fence();
-
- if (IS_ERR(fence))
- return fence;
-
- xe_lrc_init_seqno_fence(lrc, fence);
- return fence;
-}
-
s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
struct iosys_map map = __xe_lrc_seqno_map(lrc);
diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
index 84e4f4ef7f68..2e6d4de16e6c 100644
--- a/drivers/gpu/drm/xe/xe_lrc.h
+++ b/drivers/gpu/drm/xe/xe_lrc.h
@@ -47,7 +47,6 @@ u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc);
struct dma_fence *xe_lrc_alloc_seqno_fence(void);
void xe_lrc_free_seqno_fence(struct dma_fence *fence);
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence);
-struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc);
s32 xe_lrc_seqno(struct xe_lrc *lrc);
u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc);
--
2.44.0
* [PATCH v2 5/5] drm/xe: Move job creation out of the struct xe_migrate::job_mutex
2024-05-23 15:29 [PATCH v2 0/5] drm/xe: Allow migrate vm gpu submissions from reclaim context Thomas Hellström
` (3 preceding siblings ...)
2024-05-23 15:29 ` [PATCH v2 4/5] drm/xe: Remove xe_lrc_create_seqno_fence() Thomas Hellström
@ 2024-05-23 15:29 ` Thomas Hellström
2024-05-23 17:14 ` ✗ CI.Patch_applied: failure for drm/xe: Allow migrate vm gpu submissions from reclaim context (rev2) Patchwork
5 siblings, 0 replies; 10+ messages in thread
From: Thomas Hellström @ 2024-05-23 15:29 UTC (permalink / raw)
To: intel-xe; +Cc: Thomas Hellström, Matthew Brost
In order to be able to run gpu jobs from reclaim context,
move job creation (where allocation takes place) out of the
struct xe_migrate::job_mutex, and prime that mutex as
reclaim-tainted.
Jobs that may need to run from reclaim context include
CCS metadata extraction at shrinking time.
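The priming in xe_migrate_init() (see the hunk below) teaches lockdep
up front that job_mutex may be taken under fs_reclaim, so any future
allocation regression under it is flagged immediately. E.g., this
hypothetical sketch would now splat on first occurrence rather than
only when a real shrink happens to race:

	mutex_lock(&m->job_mutex);
	/* GFP_KERNEL may recurse into reclaim: immediate lockdep splat. */
	obj = kmalloc(sizeof(*obj), GFP_KERNEL);
	mutex_unlock(&m->job_mutex);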
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
drivers/gpu/drm/xe/xe_migrate.c | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index bacb23de411b..cccffaf3db06 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -383,6 +383,9 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
}
mutex_init(&m->job_mutex);
+ fs_reclaim_acquire(GFP_KERNEL);
+ might_lock(&m->job_mutex);
+ fs_reclaim_release(GFP_KERNEL);
err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m);
if (err)
@@ -807,7 +810,6 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
src_L0, ccs_ofs, copy_ccs);
- mutex_lock(&m->job_mutex);
job = xe_bb_create_migration_job(m->q, bb,
xe_migrate_batch_base(m, usm),
update_idx);
@@ -827,6 +829,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
goto err_job;
}
+ mutex_lock(&m->job_mutex);
xe_sched_job_arm(job);
dma_fence_put(fence);
fence = dma_fence_get(&job->drm.s_fence->finished);
@@ -844,7 +847,6 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
err_job:
xe_sched_job_put(job);
err:
- mutex_unlock(&m->job_mutex);
xe_bb_free(bb, NULL);
err_sync:
@@ -1044,7 +1046,6 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
flush_flags = MI_FLUSH_DW_CCS;
}
- mutex_lock(&m->job_mutex);
job = xe_bb_create_migration_job(m->q, bb,
xe_migrate_batch_base(m, usm),
update_idx);
@@ -1067,6 +1068,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
goto err_job;
}
+ mutex_lock(&m->job_mutex);
xe_sched_job_arm(job);
dma_fence_put(fence);
fence = dma_fence_get(&job->drm.s_fence->finished);
@@ -1083,7 +1085,6 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
err_job:
xe_sched_job_put(job);
err:
- mutex_unlock(&m->job_mutex);
xe_bb_free(bb, NULL);
err_sync:
/* Sync partial copies if any. FIXME: job_mutex? */
@@ -1377,9 +1378,6 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
write_pgtable(tile, bb, 0, &updates[i], pt_update);
}
- if (!q)
- mutex_lock(&m->job_mutex);
-
job = xe_bb_create_migration_job(q ?: m->q, bb,
xe_migrate_batch_base(m, usm),
update_idx);
@@ -1420,6 +1418,9 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
if (err)
goto err_job;
}
+ if (!q)
+ mutex_lock(&m->job_mutex);
+
xe_sched_job_arm(job);
fence = dma_fence_get(&job->drm.s_fence->finished);
xe_sched_job_push(job);
@@ -1435,8 +1436,6 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
err_job:
xe_sched_job_put(job);
err_bb:
- if (!q)
- mutex_unlock(&m->job_mutex);
xe_bb_free(bb, NULL);
err:
drm_suballoc_free(sa_bo, NULL);
--
2.44.0
* ✗ CI.Patch_applied: failure for drm/xe: Allow migrate vm gpu submissions from reclaim context (rev2)
2024-05-23 15:29 [PATCH v2 0/5] drm/xe: Allow migrate vm gpu submissions from reclaim context Thomas Hellström
` (4 preceding siblings ...)
2024-05-23 15:29 ` [PATCH v2 5/5] drm/xe: Move job creation out of the struct xe_migrate::job_mutex Thomas Hellström
@ 2024-05-23 17:14 ` Patchwork
5 siblings, 0 replies; 10+ messages in thread
From: Patchwork @ 2024-05-23 17:14 UTC (permalink / raw)
To: Thomas Hellström; +Cc: intel-xe
== Series Details ==
Series: drm/xe: Allow migrate vm gpu submissions from reclaim context (rev2)
URL : https://patchwork.freedesktop.org/series/133918/
State : failure
== Summary ==
=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: 699062dc7046 drm-tip: 2024y-05m-23d-15h-54m-28s UTC integration manifest
=== git am output follows ===
error: patch failed: drivers/gpu/drm/xe/xe_sched_job.c:155
error: drivers/gpu/drm/xe/xe_sched_job.c: patch does not apply
hint: Use 'git am --show-current-patch=diff' to see the failed patch
Applying: drm/xe: Decouple job seqno and lrc seqno
Applying: drm/xe: Split lrc seqno fence creation up
Applying: drm/xe: Don't initialize fences at xe_sched_job_create()
Patch failed at 0003 drm/xe: Don't initialize fences at xe_sched_job_create()
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".
* Re: [PATCH v2 2/5] drm/xe: Split lrc seqno fence creation up
2024-05-23 15:29 ` [PATCH v2 2/5] drm/xe: Split lrc seqno fence creation up Thomas Hellström
@ 2024-05-23 17:58 ` Matthew Brost
0 siblings, 0 replies; 10+ messages in thread
From: Matthew Brost @ 2024-05-23 17:58 UTC (permalink / raw)
To: Thomas Hellström; +Cc: intel-xe
On Thu, May 23, 2024 at 05:29:08PM +0200, Thomas Hellström wrote:
> Since sometimes a lock is required to initialize a seqno fence,
> and it might be desirable not to hold that lock while performing
> memory allocations, split the lrc seqno fence creation up into an
> allocation phase and an initialization phase.
>
> Since lrc seqno fences under the hood are hw_fences, do the same
> for these and remove the xe_hw_fence_create() function since it
> is not used anymore.
>
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
I think in general this is actually a really good patch to perhaps
share with the team as a learning. We (mainly me) have conflated
create / init functions frequently in Xe, and I think this is the 3rd
or 4th time we have had to split a function. Probably the best
practice going forward would be to always split these upfront.
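A generic shape of that practice, for reference (hypothetical object
type, just to show the split):

	struct foo *foo_alloc(void);	/* may sleep, no locks required */
	void foo_init(struct foo *f);	/* cheap, callable under locks */
	void foo_free(struct foo *f);	/* for the not-yet-initialized case */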
Anyways, patch LGTM:
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
> ---
> drivers/gpu/drm/xe/xe_hw_fence.c | 59 +++++++++++++++++++++++++-------
> drivers/gpu/drm/xe/xe_hw_fence.h | 7 ++--
> drivers/gpu/drm/xe/xe_lrc.c | 48 ++++++++++++++++++++++++--
> drivers/gpu/drm/xe/xe_lrc.h | 3 ++
> 4 files changed, 101 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_hw_fence.c b/drivers/gpu/drm/xe/xe_hw_fence.c
> index f872ef103127..35c0063a831a 100644
> --- a/drivers/gpu/drm/xe/xe_hw_fence.c
> +++ b/drivers/gpu/drm/xe/xe_hw_fence.c
> @@ -208,23 +208,58 @@ static struct xe_hw_fence *to_xe_hw_fence(struct dma_fence *fence)
> return container_of(fence, struct xe_hw_fence, dma);
> }
>
> -struct xe_hw_fence *xe_hw_fence_create(struct xe_hw_fence_ctx *ctx,
> - struct iosys_map seqno_map)
> +/**
> + * xe_hw_fence_alloc() - Allocate an hw fence.
> + *
> + * Allocate but don't initialize an hw fence.
> + *
> + * Return: Pointer to the allocated fence or
> + * negative error pointer on error.
> + */
> +struct dma_fence *xe_hw_fence_alloc(void)
> {
> - struct xe_hw_fence *fence;
> + struct xe_hw_fence *hw_fence = fence_alloc();
>
> - fence = fence_alloc();
> - if (!fence)
> + if (!hw_fence)
> return ERR_PTR(-ENOMEM);
>
> - fence->ctx = ctx;
> - fence->seqno_map = seqno_map;
> - INIT_LIST_HEAD(&fence->irq_link);
> + return &hw_fence->dma;
> +}
>
> - dma_fence_init(&fence->dma, &xe_hw_fence_ops, &ctx->irq->lock,
> - ctx->dma_fence_ctx, ctx->next_seqno++);
> +/**
> + * xe_hw_fence_free() - Free an hw fence.
> + * @fence: Pointer to the fence to free.
> + *
> + * Frees an hw fence that hasn't yet been
> + * initialized.
> + */
> +void xe_hw_fence_free(struct dma_fence *fence)
> +{
> + fence_free(&fence->rcu);
> +}
>
> - trace_xe_hw_fence_create(fence);
> +/**
> + * xe_hw_fence_init() - Initialize an hw fence.
> + * @fence: Pointer to the fence to initialize.
> + * @ctx: Pointer to the struct xe_hw_fence_ctx fence context.
> + * @seqno_map: Pointer to the map into where the seqno is blitted.
> + *
> + * Initializes a pre-allocated hw fence.
> + * After initialization, the fence is subject to normal
> + * dma-fence refcounting.
> + */
> +void xe_hw_fence_init(struct dma_fence *fence, struct xe_hw_fence_ctx *ctx,
> + struct iosys_map seqno_map)
> +{
> + struct xe_hw_fence *hw_fence =
> + container_of(fence, typeof(*hw_fence), dma);
> +
> + hw_fence->ctx = ctx;
> + hw_fence->seqno_map = seqno_map;
> + INIT_LIST_HEAD(&hw_fence->irq_link);
> +
> + dma_fence_init(fence, &xe_hw_fence_ops, &ctx->irq->lock,
> + ctx->dma_fence_ctx, ctx->next_seqno++);
>
> - return fence;
> + trace_xe_hw_fence_create(hw_fence);
> }
> diff --git a/drivers/gpu/drm/xe/xe_hw_fence.h b/drivers/gpu/drm/xe/xe_hw_fence.h
> index cfe5fd603787..f13a1c4982c7 100644
> --- a/drivers/gpu/drm/xe/xe_hw_fence.h
> +++ b/drivers/gpu/drm/xe/xe_hw_fence.h
> @@ -24,7 +24,10 @@ void xe_hw_fence_ctx_init(struct xe_hw_fence_ctx *ctx, struct xe_gt *gt,
> struct xe_hw_fence_irq *irq, const char *name);
> void xe_hw_fence_ctx_finish(struct xe_hw_fence_ctx *ctx);
>
> -struct xe_hw_fence *xe_hw_fence_create(struct xe_hw_fence_ctx *ctx,
> - struct iosys_map seqno_map);
> +struct dma_fence *xe_hw_fence_alloc(void);
>
> +void xe_hw_fence_free(struct dma_fence *fence);
> +
> +void xe_hw_fence_init(struct dma_fence *fence, struct xe_hw_fence_ctx *ctx,
> + struct iosys_map seqno_map);
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> index 9b0a4078add3..7441230ad627 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.c
> +++ b/drivers/gpu/drm/xe/xe_lrc.c
> @@ -1030,10 +1030,54 @@ u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
> return __xe_lrc_seqno_ggtt_addr(lrc);
> }
>
> +/**
> + * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
> + *
> + * Allocate but don't initialize an lrc seqno fence.
> + *
> + * Return: Pointer to the allocated fence or
> + * negative error pointer on error.
> + */
> +struct dma_fence *xe_lrc_alloc_seqno_fence(void)
> +{
> + return xe_hw_fence_alloc();
> +}
> +
> +/**
> + * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
> + * @fence: Pointer to the fence to free.
> + *
> + * Frees an lrc seqno fence that hasn't yet been
> + * initialized.
> + */
> +void xe_lrc_free_seqno_fence(struct dma_fence *fence)
> +{
> + xe_hw_fence_free(fence);
> +}
> +
> +/**
> + * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
> + * @lrc: Pointer to the lrc.
> + * @fence: Pointer to the fence to initialize.
> + *
> + * Initializes a pre-allocated lrc seqno fence.
> + * After initialization, the fence is subject to normal
> + * dma-fence refcounting.
> + */
> +void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
> +{
> + xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
> +}
> +
> struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
> {
> - return &xe_hw_fence_create(&lrc->fence_ctx,
> - __xe_lrc_seqno_map(lrc))->dma;
> + struct dma_fence *fence = xe_lrc_alloc_seqno_fence();
> +
> + if (IS_ERR(fence))
> + return fence;
> +
> + xe_lrc_init_seqno_fence(lrc, fence);
> + return fence;
> }
>
> s32 xe_lrc_seqno(struct xe_lrc *lrc)
> diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
> index e0e841963c23..84e4f4ef7f68 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.h
> +++ b/drivers/gpu/drm/xe/xe_lrc.h
> @@ -44,6 +44,9 @@ void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val);
> u64 xe_lrc_descriptor(struct xe_lrc *lrc);
>
> u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc);
> +struct dma_fence *xe_lrc_alloc_seqno_fence(void);
> +void xe_lrc_free_seqno_fence(struct dma_fence *fence);
> +void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence);
> struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc);
> s32 xe_lrc_seqno(struct xe_lrc *lrc);
>
> --
> 2.44.0
>
* Re: [PATCH v2 3/5] drm/xe: Don't initialize fences at xe_sched_job_create()
2024-05-23 15:29 ` [PATCH v2 3/5] drm/xe: Don't initialize fences at xe_sched_job_create() Thomas Hellström
@ 2024-05-23 18:16 ` Matthew Brost
0 siblings, 0 replies; 10+ messages in thread
From: Matthew Brost @ 2024-05-23 18:16 UTC (permalink / raw)
To: Thomas Hellström; +Cc: intel-xe
On Thu, May 23, 2024 at 05:29:09PM +0200, Thomas Hellström wrote:
> Pre-allocate but don't initialize fences at xe_sched_job_create(),
> and initialize / arm them instead at xe_sched_job_arm(). This
> makes it possible to move xe_sched_job_create() with its memory
> allocation out of any lock that is required for fence
> initialization, and that may not allow memory allocation under it.
>
> Replaces the struct dma_fence_array for parallell jobs with a
> struct dma_fence_chain, since the former doesn't allow
> a split-up between allocation and initialization.
>
> v2:
> - Rebase.
> - Don't always use the first lrc when initializing parallel
> lrc fences.
> - Use dma_fence_chain_contained() to access the lrc fences.
>
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
> drivers/gpu/drm/xe/xe_exec_queue.c | 5 -
> drivers/gpu/drm/xe/xe_exec_queue_types.h | 10 --
> drivers/gpu/drm/xe/xe_ring_ops.c | 12 +-
> drivers/gpu/drm/xe/xe_sched_job.c | 159 +++++++++++++----------
> drivers/gpu/drm/xe/xe_sched_job_types.h | 18 ++-
> drivers/gpu/drm/xe/xe_trace.h | 2 +-
> 6 files changed, 113 insertions(+), 93 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index e4607f0e3456..a5969271a964 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -96,11 +96,6 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe,
> }
> }
>
> - if (xe_exec_queue_is_parallel(q)) {
> - q->parallel.composite_fence_ctx = dma_fence_context_alloc(1);
> - q->parallel.composite_fence_seqno = 0;
> - }
> -
> return q;
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> index ee78d497d838..f0c40e8ad80a 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> @@ -103,16 +103,6 @@ struct xe_exec_queue {
> struct xe_guc_exec_queue *guc;
> };
>
> - /**
> - * @parallel: parallel submission state
> - */
> - struct {
> - /** @parallel.composite_fence_ctx: context composite fence */
> - u64 composite_fence_ctx;
> - /** @parallel.composite_fence_seqno: seqno for composite fence */
> - u32 composite_fence_seqno;
> - } parallel;
> -
> /** @sched_props: scheduling properties */
> struct {
> /** @sched_props.timeslice_us: timeslice period in micro-seconds */
> diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
> index 2705d1f9d572..f75756e7a87b 100644
> --- a/drivers/gpu/drm/xe/xe_ring_ops.c
> +++ b/drivers/gpu/drm/xe/xe_ring_ops.c
> @@ -366,7 +366,7 @@ static void emit_migration_job_gen12(struct xe_sched_job *job,
>
> dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE; /* Enabled again below */
>
> - i = emit_bb_start(job->batch_addr[0], BIT(8), dw, i);
> + i = emit_bb_start(job->ptrs[0].batch_addr, BIT(8), dw, i);
>
> if (!IS_SRIOV_VF(gt_to_xe(job->q->gt))) {
> /* XXX: Do we need this? Leaving for now. */
> @@ -375,7 +375,7 @@ static void emit_migration_job_gen12(struct xe_sched_job *job,
> dw[i++] = preparser_disable(false);
> }
>
> - i = emit_bb_start(job->batch_addr[1], BIT(8), dw, i);
> + i = emit_bb_start(job->ptrs[1].batch_addr, BIT(8), dw, i);
>
> dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | job->migrate_flush_flags |
> MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW;
> @@ -397,7 +397,7 @@ static void emit_job_gen12_gsc(struct xe_sched_job *job)
> xe_gt_assert(gt, job->q->width <= 1); /* no parallel submission for GSCCS */
>
> __emit_job_gen12_simple(job, job->q->lrc,
> - job->batch_addr[0],
> + job->ptrs[0].batch_addr,
> xe_sched_job_lrc_seqno(job));
> }
>
> @@ -413,7 +413,7 @@ static void emit_job_gen12_copy(struct xe_sched_job *job)
>
> for (i = 0; i < job->q->width; ++i)
> __emit_job_gen12_simple(job, job->q->lrc + i,
> - job->batch_addr[i],
> + job->ptrs[i].batch_addr,
> xe_sched_job_lrc_seqno(job));
> }
>
> @@ -424,7 +424,7 @@ static void emit_job_gen12_video(struct xe_sched_job *job)
> /* FIXME: Not doing parallel handshake for now */
> for (i = 0; i < job->q->width; ++i)
> __emit_job_gen12_video(job, job->q->lrc + i,
> - job->batch_addr[i],
> + job->ptrs[i].batch_addr,
> xe_sched_job_lrc_seqno(job));
> }
>
> @@ -434,7 +434,7 @@ static void emit_job_gen12_render_compute(struct xe_sched_job *job)
>
> for (i = 0; i < job->q->width; ++i)
> __emit_job_gen12_render_compute(job, job->q->lrc + i,
> - job->batch_addr[i],
> + job->ptrs[i].batch_addr,
> xe_sched_job_lrc_seqno(job));
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
> index 8dd612b8b2b0..95c6c0411592 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.c
> +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> @@ -6,7 +6,7 @@
> #include "xe_sched_job.h"
>
> #include <drm/xe_drm.h>
> -#include <linux/dma-fence-array.h>
> +#include <linux/dma-fence-chain.h>
> #include <linux/slab.h>
>
> #include "xe_device.h"
> @@ -29,7 +29,7 @@ int __init xe_sched_job_module_init(void)
> xe_sched_job_slab =
> kmem_cache_create("xe_sched_job",
> sizeof(struct xe_sched_job) +
> - sizeof(u64), 0,
> + sizeof(struct xe_job_ptrs), 0,
> SLAB_HWCACHE_ALIGN, NULL);
> if (!xe_sched_job_slab)
> return -ENOMEM;
> @@ -37,7 +37,7 @@ int __init xe_sched_job_module_init(void)
> xe_sched_job_parallel_slab =
> kmem_cache_create("xe_sched_job_parallel",
> sizeof(struct xe_sched_job) +
> - sizeof(u64) *
> + sizeof(struct xe_job_ptrs) *
> XE_HW_ENGINE_MAX_INSTANCE, 0,
> SLAB_HWCACHE_ALIGN, NULL);
> if (!xe_sched_job_parallel_slab) {
> @@ -79,26 +79,33 @@ static struct xe_device *job_to_xe(struct xe_sched_job *job)
> return gt_to_xe(job->q->gt);
> }
>
> +/* Free unused pre-allocated fences */
> +static void xe_sched_job_free_fences(struct xe_sched_job *job)
> +{
> + int i;
> +
> + for (i = 0; i < job->q->width; ++i) {
> + struct xe_job_ptrs *ptrs = &job->ptrs[i];
> +
> + if (ptrs->lrc_fence)
> + xe_lrc_free_seqno_fence(ptrs->lrc_fence);
> + if (ptrs->chain_fence)
> + dma_fence_chain_free(ptrs->chain_fence);
> + }
> +}
> +
> struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
> u64 *batch_addr)
> {
> - struct xe_sched_job *job;
> - struct dma_fence **fences;
> bool is_migration = xe_sched_job_is_migration(q);
> + struct xe_sched_job *job;
> int err;
> - int i, j;
> + int i;
> u32 width;
>
> /* only a kernel context can submit a vm-less job */
> XE_WARN_ON(!q->vm && !(q->flags & EXEC_QUEUE_FLAG_KERNEL));
>
> - /* Migration and kernel engines have their own locking */
> - if (!(q->flags & (EXEC_QUEUE_FLAG_KERNEL | EXEC_QUEUE_FLAG_VM))) {
> - lockdep_assert_held(&q->vm->lock);
> - if (!xe_vm_in_lr_mode(q->vm))
> - xe_vm_assert_held(q->vm);
> - }
> -
> job = job_alloc(xe_exec_queue_is_parallel(q) || is_migration);
> if (!job)
> return ERR_PTR(-ENOMEM);
> @@ -111,43 +118,25 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
> if (err)
> goto err_free;
>
> - if (!xe_exec_queue_is_parallel(q)) {
> - job->fence = xe_lrc_create_seqno_fence(q->lrc);
> - if (IS_ERR(job->fence)) {
> - err = PTR_ERR(job->fence);
> - goto err_sched_job;
> - }
> - job->lrc_seqno = job->fence->seqno;
> - } else {
> - struct dma_fence_array *cf;
> + for (i = 0; i < q->width; ++i) {
> + struct dma_fence *fence = xe_lrc_alloc_seqno_fence();
> + struct dma_fence_chain *chain;
>
> - fences = kmalloc_array(q->width, sizeof(*fences), GFP_KERNEL);
> - if (!fences) {
> - err = -ENOMEM;
> + if (IS_ERR(fence)) {
> + err = PTR_ERR(fence);
> goto err_sched_job;
> }
> + job->ptrs[i].lrc_fence = fence;
>
> - for (j = 0; j < q->width; ++j) {
> - fences[j] = xe_lrc_create_seqno_fence(q->lrc + j);
> - if (IS_ERR(fences[j])) {
> - err = PTR_ERR(fences[j]);
> - goto err_fences;
> - }
> - if (!j)
> - job->lrc_seqno = fences[0]->seqno;
> - }
> + if (i + 1 == q->width)
> + continue;
>
> - cf = dma_fence_array_create(q->width, fences,
> - q->parallel.composite_fence_ctx,
> - q->parallel.composite_fence_seqno++,
> - false);
> - if (!cf) {
> - --q->parallel.composite_fence_seqno;
> + chain = dma_fence_chain_alloc();
> + if (!chain) {
> err = -ENOMEM;
> - goto err_fences;
> + goto err_sched_job;
> }
> -
> - job->fence = &cf->base;
> + job->ptrs[i].chain_fence = chain;
> }
>
> width = q->width;
> @@ -155,7 +144,7 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
> width = 2;
>
> for (i = 0; i < width; ++i)
> - job->batch_addr[i] = batch_addr[i];
> + job->ptrs[i].batch_addr = batch_addr[i];
>
> /* All other jobs require a VM to be open which has a ref */
> if (unlikely(q->flags & EXEC_QUEUE_FLAG_KERNEL))
> @@ -165,13 +154,8 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
> trace_xe_sched_job_create(job);
> return job;
>
> -err_fences:
> - for (j = j - 1; j >= 0; --j) {
> - --q->lrc[j].fence_ctx.next_seqno;
> - dma_fence_put(fences[j]);
> - }
> - kfree(fences);
> err_sched_job:
> + xe_sched_job_free_fences(job);
> drm_sched_job_cleanup(&job->drm);
> err_free:
> xe_exec_queue_put(q);
> @@ -193,33 +177,39 @@ void xe_sched_job_destroy(struct kref *ref)
>
> if (unlikely(job->q->flags & EXEC_QUEUE_FLAG_KERNEL))
> xe_pm_runtime_put(job_to_xe(job));
> + xe_sched_job_free_fences(job);
> xe_exec_queue_put(job->q);
> dma_fence_put(job->fence);
> drm_sched_job_cleanup(&job->drm);
> job_free(job);
> }
>
> -void xe_sched_job_set_error(struct xe_sched_job *job, int error)
> +/* Set the error status under the fence lock to avoid racing with signaling */
> +static bool xe_fence_set_error(struct dma_fence *fence, int error)
> {
> - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags))
> - return;
> + unsigned long irq_flags;
> + bool signaled;
>
> - dma_fence_set_error(job->fence, error);
> + spin_lock_irqsave(fence->lock, irq_flags);
> + signaled = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags);
> + if (!signaled)
> + dma_fence_set_error(fence, error);
> + spin_unlock_irqrestore(fence->lock, irq_flags);
> +
> + return signaled;
> +}
>
> - if (dma_fence_is_array(job->fence)) {
> - struct dma_fence_array *array =
> - to_dma_fence_array(job->fence);
> - struct dma_fence **child = array->fences;
> - unsigned int nchild = array->num_fences;
> +void xe_sched_job_set_error(struct xe_sched_job *job, int error)
> +{
> + if (xe_fence_set_error(job->fence, error))
> + return;
>
> - do {
> - struct dma_fence *current_fence = *child++;
> + if (dma_fence_is_chain(job->fence)) {
> + struct dma_fence *iter;
>
> - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> - &current_fence->flags))
> - continue;
> - dma_fence_set_error(current_fence, error);
> - } while (--nchild);
> + dma_fence_chain_for_each(iter, job->fence)
> + xe_fence_set_error(dma_fence_chain_contained(iter),
> + error);
> }
>
> trace_xe_sched_job_set_error(job);
> @@ -234,7 +224,7 @@ bool xe_sched_job_started(struct xe_sched_job *job)
>
> return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
> xe_lrc_start_seqno(lrc),
> - dma_fence_array_first(job->fence)->ops);
> + dma_fence_chain_contained(job->fence)->ops);
> }
>
> bool xe_sched_job_completed(struct xe_sched_job *job)
> @@ -248,13 +238,24 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
>
> return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
> xe_lrc_seqno(lrc),
> - dma_fence_array_first(job->fence)->ops);
> + dma_fence_chain_contained(job->fence)->ops);
> }
>
> void xe_sched_job_arm(struct xe_sched_job *job)
> {
> struct xe_exec_queue *q = job->q;
> + struct dma_fence *fence, *prev;
> struct xe_vm *vm = q->vm;
> + u64 seqno = 0;
> + int i;
> +
> + /* Migration and kernel engines have their own locking */
> + if (IS_ENABLED(CONFIG_LOCKDEP) &&
> + !(q->flags & (EXEC_QUEUE_FLAG_KERNEL | EXEC_QUEUE_FLAG_VM))) {
> + lockdep_assert_held(&q->vm->lock);
> + if (!xe_vm_in_lr_mode(q->vm))
> + xe_vm_assert_held(q->vm);
> + }
>
> if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) &&
> (vm->batch_invalidate_tlb || vm->tlb_flush_seqno != q->tlb_flush_seqno)) {
> @@ -263,6 +264,25 @@ void xe_sched_job_arm(struct xe_sched_job *job)
> job->ring_ops_flush_tlb = true;
> }
>
> + /* Arm the pre-allocated fences */
> + for (i = 0; i < q->width; prev = fence, ++i) {
> + struct dma_fence_chain *chain;
> +
> + fence = job->ptrs[i].lrc_fence;
> + xe_lrc_init_seqno_fence(&q->lrc[i], fence);
> + job->ptrs[i].lrc_fence = NULL;
> + if (!i) {
> + job->lrc_seqno = fence->seqno;
> + continue;
> + }
I removed the assert from my RFC that checked the lrc seqno against the
composite fence seqno, but I think a version of it still has value, since
job->lrc_seqno is used in the ring ops for every LRC in the exec queue.
Something like:
	if (!i) {
		job->lrc_seqno = fence->seqno;
		continue;
	}
	xe_assert(xe, job->lrc_seqno == fence->seqno);
With that addition:
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
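Folded into the arming loop from the hunk quoted above, that would look
roughly like the sketch below. This is only a sketch, not code from the
series; it assumes the device pointer for the assert is obtained with
job_to_xe(), as done elsewhere in this file:

	for (i = 0; i < q->width; prev = fence, ++i) {
		struct dma_fence_chain *chain;

		/* Take ownership of the pre-allocated hw fence and arm it. */
		fence = job->ptrs[i].lrc_fence;
		xe_lrc_init_seqno_fence(&q->lrc[i], fence);
		job->ptrs[i].lrc_fence = NULL;
		if (!i) {
			/* The first LRC fence defines the job's lrc seqno. */
			job->lrc_seqno = fence->seqno;
			continue;
		}
		/* Every LRC in the queue should hand out the same seqno. */
		xe_assert(job_to_xe(job), job->lrc_seqno == fence->seqno);

		/* Link this fence to the previous one via a chain node. */
		chain = job->ptrs[i - 1].chain_fence;
		dma_fence_chain_init(chain, prev, fence, seqno++);
		job->ptrs[i - 1].chain_fence = NULL;
		fence = &chain->base;
	}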
> +
> + chain = job->ptrs[i - 1].chain_fence;
> + dma_fence_chain_init(chain, prev, fence, seqno++);
> + job->ptrs[i - 1].chain_fence = NULL;
> + fence = &chain->base;
> + }
> +
> + job->fence = fence;
> drm_sched_job_arm(&job->drm);
> }
>
> @@ -322,7 +342,8 @@ xe_sched_job_snapshot_capture(struct xe_sched_job *job)
>
> snapshot->batch_addr_len = q->width;
> for (i = 0; i < q->width; i++)
> - snapshot->batch_addr[i] = xe_device_uncanonicalize_addr(xe, job->batch_addr[i]);
> + snapshot->batch_addr[i] =
> + xe_device_uncanonicalize_addr(xe, job->ptrs[i].batch_addr);
>
> return snapshot;
> }
> diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
> index 990ddac55ed6..0d3f76fb05ce 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job_types.h
> +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
> @@ -11,6 +11,20 @@
> #include <drm/gpu_scheduler.h>
>
> struct xe_exec_queue;
> +struct dma_fence;
> +struct dma_fence_chain;
> +
> +/**
> + * struct xe_job_ptrs - Per hw engine instance data
> + */
> +struct xe_job_ptrs {
> + /** @lrc_fence: Pre-allocated uninitialized lrc fence. */
> + struct dma_fence *lrc_fence;
> + /** @chain_fence: Pre-allocated uninitialized fence chain node. */
> + struct dma_fence_chain *chain_fence;
> + /** @batch_addr: Batch buffer address. */
> + u64 batch_addr;
> +};
>
> /**
> * struct xe_sched_job - XE schedule job (batch buffer tracking)
> @@ -43,8 +57,8 @@ struct xe_sched_job {
> u32 migrate_flush_flags;
> /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */
> bool ring_ops_flush_tlb;
> - /** @batch_addr: batch buffer address of job */
> - u64 batch_addr[];
> + /** @ptrs: per instance pointers. */
> + struct xe_job_ptrs ptrs[];
> };
>
> struct xe_sched_job_snapshot {
> diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
> index 6c6cecc58f63..450f407c66e8 100644
> --- a/drivers/gpu/drm/xe/xe_trace.h
> +++ b/drivers/gpu/drm/xe/xe_trace.h
> @@ -272,7 +272,7 @@ DECLARE_EVENT_CLASS(xe_sched_job,
> __entry->flags = job->q->flags;
> __entry->error = job->fence->error;
> __entry->fence = job->fence;
> - __entry->batch_addr = (u64)job->batch_addr[0];
> + __entry->batch_addr = (u64)job->ptrs[0].batch_addr;
> ),
>
> TP_printk("fence=%p, seqno=%u, lrc_seqno=%u, guc_id=%d, batch_addr=0x%012llx, guc_state=0x%x, flags=0x%x, error=%d",
> --
> 2.44.0
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v2 4/5] drm/xe: Remove xe_lrc_create_seqno_fence()
2024-05-23 15:29 ` [PATCH v2 4/5] drm/xe: Remove xe_lrc_create_seqno_fence() Thomas Hellström
@ 2024-05-24 6:56 ` Matthew Brost
0 siblings, 0 replies; 10+ messages in thread
From: Matthew Brost @ 2024-05-24 6:56 UTC (permalink / raw)
To: Thomas Hellström; +Cc: intel-xe
On Thu, May 23, 2024 at 05:29:10PM +0200, Thomas Hellström wrote:
> It's not used anymore.
>
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
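For context, with the combined helper removed, callers follow the split
allocate-then-initialize pattern this series introduces. A rough sketch
of the intended migrate-side call order (not code from the series; m is
assumed to be the struct xe_migrate owning the job_mutex, and error
handling is elided):

	struct xe_sched_job *job;

	/* GFP_KERNEL allocations (job, hw fences, chain nodes) happen
	 * here, outside the job_mutex, so the mutex remains safe to
	 * take from reclaim context.
	 */
	job = xe_sched_job_create(q, batch_addr);

	mutex_lock(&m->job_mutex);
	/* Arming only initializes the pre-allocated fences; nothing
	 * is allocated under the lock.
	 */
	xe_sched_job_arm(job);
	/* ... emit and submit ... */
	mutex_unlock(&m->job_mutex);

Splitting allocation from initialization is what lets patch 5 pull the
job creation out from under the mutex while keeping seqno assignment
ordered by xe_sched_job_arm() inside it.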
> ---
> drivers/gpu/drm/xe/xe_lrc.c | 11 -----------
> drivers/gpu/drm/xe/xe_lrc.h | 1 -
> 2 files changed, 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> index 7441230ad627..0c7d8dad1cb1 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.c
> +++ b/drivers/gpu/drm/xe/xe_lrc.c
> @@ -1069,17 +1069,6 @@ void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
> xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
> }
>
> -struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
> -{
> - struct dma_fence *fence = xe_lrc_alloc_seqno_fence();
> -
> - if (IS_ERR(fence))
> - return fence;
> -
> - xe_lrc_init_seqno_fence(lrc, fence);
> - return fence;
> -}
> -
> s32 xe_lrc_seqno(struct xe_lrc *lrc)
> {
> struct iosys_map map = __xe_lrc_seqno_map(lrc);
> diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
> index 84e4f4ef7f68..2e6d4de16e6c 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.h
> +++ b/drivers/gpu/drm/xe/xe_lrc.h
> @@ -47,7 +47,6 @@ u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc);
> struct dma_fence *xe_lrc_alloc_seqno_fence(void);
> void xe_lrc_free_seqno_fence(struct dma_fence *fence);
> void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence);
> -struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc);
> s32 xe_lrc_seqno(struct xe_lrc *lrc);
>
> u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc);
> --
> 2.44.0
>
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2024-05-24 6:58 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-23 15:29 [PATCH v2 0/5] drm/xe: Allow migrate vm gpu submissions from reclaim context Thomas Hellström
2024-05-23 15:29 ` [PATCH v2 1/5] drm/xe: Decouple job seqno and lrc seqno Thomas Hellström
2024-05-23 15:29 ` [PATCH v2 2/5] drm/xe: Split lrc seqno fence creation up Thomas Hellström
2024-05-23 17:58 ` Matthew Brost
2024-05-23 15:29 ` [PATCH v2 3/5] drm/xe: Don't initialize fences at xe_sched_job_create() Thomas Hellström
2024-05-23 18:16 ` Matthew Brost
2024-05-23 15:29 ` [PATCH v2 4/5] drm/xe: Remove xe_lrc_create_seqno_fence() Thomas Hellström
2024-05-24 6:56 ` Matthew Brost
2024-05-23 15:29 ` [PATCH v2 5/5] drm/xe: Move job creation out of the struct xe_migrate::job_mutex Thomas Hellström
2024-05-23 17:14 ` ✗ CI.Patch_applied: failure for drm/xe: Allow migrate vm gpu submissions from reclaim context (rev2) Patchwork
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox