[PATCH 1/7] drm/amdgpu: don't reemit ring contents more than once

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/7] drm/amdgpu: don't reemit ring contents more than once
@ 2025-12-18 22:41 Alex Deucher
  2025-12-18 22:41 ` [PATCH 2/7] drm/amdgpu: always backup and reemit fences Alex Deucher
                   ` (5 more replies)
  0 siblings, 6 replies; 11+ messages in thread
From: Alex Deucher @ 2025-12-18 22:41 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Timur Kristóf

If we cancel a bad job and reemit the ring contents, and
we get another timeout, cancel everything rather than reemitting.
The wptr markers are only relevant for the original emit.  If
we reemit, the wptr markers are no longer correct.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 22 +++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  2 ++
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 1fe31d2f27060..334ddd6e48c06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -710,6 +710,7 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
 	struct amdgpu_ring *ring = af->ring;
 	unsigned long flags;
 	u32 seq, last_seq;
+	bool reemitted = false;
 
 	last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
 	seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
@@ -727,7 +728,9 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
 		if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
 			fence = container_of(unprocessed, struct amdgpu_fence, base);
 
-			if (fence == af)
+			if (fence->reemitted > 1)
+				reemitted = true;
+			else if (fence == af)
 				dma_fence_set_error(&fence->base, -ETIME);
 			else if (fence->context == af->context)
 				dma_fence_set_error(&fence->base, -ECANCELED);
@@ -735,9 +738,16 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
 		rcu_read_unlock();
 	} while (last_seq != seq);
 	spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
-	/* signal the guilty fence */
-	amdgpu_fence_write(ring, (u32)af->base.seqno);
-	amdgpu_fence_process(ring);
+
+	if (reemitted) {
+		/* if we've already reemitted once then just cancel everything */
+		amdgpu_fence_driver_force_completion(af->ring);
+		af->ring->ring_backup_entries_to_copy = 0;
+	} else {
+		/* signal the guilty fence */
+		amdgpu_fence_write(ring, (u32)af->base.seqno);
+		amdgpu_fence_process(ring);
+	}
 }
 
 void amdgpu_fence_save_wptr(struct amdgpu_fence *af)
@@ -785,10 +795,12 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring,
 			/* save everything if the ring is not guilty, otherwise
 			 * just save the content from other contexts.
 			 */
-			if (!guilty_fence || (fence->context != guilty_fence->context))
+			if (!fence->reemitted &&
+			    (!guilty_fence || (fence->context != guilty_fence->context)))
 				amdgpu_ring_backup_unprocessed_command(ring, wptr,
 								       fence->wptr);
 			wptr = fence->wptr;
+			fence->reemitted++;
 		}
 		rcu_read_unlock();
 	} while (last_seq != seq);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index a1fb0fadb6eab..d881829528976 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -150,6 +150,8 @@ struct amdgpu_fence {
 	u64				wptr;
 	/* fence context for resets */
 	u64				context;
+	/* has this fence been reemitted */
+	unsigned int			reemitted;
 };
 
 extern const struct drm_sched_backend_ops amdgpu_sched_ops;
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/7] drm/amdgpu: always backup and reemit fences
  2025-12-18 22:41 [PATCH 1/7] drm/amdgpu: don't reemit ring contents more than once Alex Deucher
@ 2025-12-18 22:41 ` Alex Deucher
  2025-12-18 22:41 ` [PATCH 3/7] drm/amdgpu: use dma_fence_get_status() for adapter reset Alex Deucher
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Alex Deucher @ 2025-12-18 22:41 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Timur Kristóf

If when we backup the ring contents for reemit before a
ring reset, we skip jobs associated with the bad
context, however, we need to make sure the fences
are reemited as unprocessed submissions may depend on
them.

v2: clean up fence handling, make helpers static

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 24 ++++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  5 ++++-
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 334ddd6e48c06..3a23cce5f769a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -89,6 +89,16 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
 	return seq;
 }
 
+static void amdgpu_fence_save_fence_wptr_start(struct amdgpu_fence *af)
+{
+	af->fence_wptr_start = af->ring->wptr;
+}
+
+static void amdgpu_fence_save_fence_wptr_end(struct amdgpu_fence *af)
+{
+	af->fence_wptr_end = af->ring->wptr;
+}
+
 /**
  * amdgpu_fence_emit - emit a fence on the requested ring
  *
@@ -116,8 +126,10 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct amdgpu_fence *af,
 		       &ring->fence_drv.lock,
 		       adev->fence_context + ring->idx, seq);
 
+	amdgpu_fence_save_fence_wptr_start(af);
 	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
 			       seq, flags | AMDGPU_FENCE_FLAG_INT);
+	amdgpu_fence_save_fence_wptr_end(af);
 	amdgpu_fence_save_wptr(af);
 	pm_runtime_get_noresume(adev_to_drm(adev)->dev);
 	ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
@@ -743,10 +755,6 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
 		/* if we've already reemitted once then just cancel everything */
 		amdgpu_fence_driver_force_completion(af->ring);
 		af->ring->ring_backup_entries_to_copy = 0;
-	} else {
-		/* signal the guilty fence */
-		amdgpu_fence_write(ring, (u32)af->base.seqno);
-		amdgpu_fence_process(ring);
 	}
 }
 
@@ -796,9 +804,15 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring,
 			 * just save the content from other contexts.
 			 */
 			if (!fence->reemitted &&
-			    (!guilty_fence || (fence->context != guilty_fence->context)))
+			    (!guilty_fence || (fence->context != guilty_fence->context))) {
 				amdgpu_ring_backup_unprocessed_command(ring, wptr,
 								       fence->wptr);
+			} else if (!fence->reemitted) {
+				/* always save the fence */
+				amdgpu_ring_backup_unprocessed_command(ring,
+								       fence->fence_wptr_start,
+								       fence->fence_wptr_end);
+			}
 			wptr = fence->wptr;
 			fence->reemitted++;
 		}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index d881829528976..87c9df6c2ecfe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -146,12 +146,15 @@ struct amdgpu_fence {
 	struct amdgpu_ring		*ring;
 	ktime_t				start_timestamp;
 
-	/* wptr for the fence for resets */
+	/* wptr for the total submission for resets */
 	u64				wptr;
 	/* fence context for resets */
 	u64				context;
 	/* has this fence been reemitted */
 	unsigned int			reemitted;
+	/* wptr for the fence for the submission */
+	u64				fence_wptr_start;
+	u64				fence_wptr_end;
 };
 
 extern const struct drm_sched_backend_ops amdgpu_sched_ops;
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/7] drm/amdgpu: use dma_fence_get_status() for adapter reset
  2025-12-18 22:41 [PATCH 1/7] drm/amdgpu: don't reemit ring contents more than once Alex Deucher
  2025-12-18 22:41 ` [PATCH 2/7] drm/amdgpu: always backup and reemit fences Alex Deucher
@ 2025-12-18 22:41 ` Alex Deucher
  2025-12-18 22:41 ` [PATCH 4/7] drm/amdgpu: avoid a warning in timedout job handler Alex Deucher
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Alex Deucher @ 2025-12-18 22:41 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher

We need to check if the fence was signaled without an
error as the per queue resets may have signalled the fence
while attempting to reset the queue.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index cd19a896dd4c4..198d1872b9247 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6367,7 +6367,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 *
 	 * job->base holds a reference to parent fence
 	 */
-	if (job && dma_fence_is_signaled(&job->hw_fence->base)) {
+	if (job && (dma_fence_get_status(&job->hw_fence->base) > 0)) {
 		job_signaled = true;
 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
 		goto skip_hw_reset;
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 4/7] drm/amdgpu: avoid a warning in timedout job handler
  2025-12-18 22:41 [PATCH 1/7] drm/amdgpu: don't reemit ring contents more than once Alex Deucher
  2025-12-18 22:41 ` [PATCH 2/7] drm/amdgpu: always backup and reemit fences Alex Deucher
  2025-12-18 22:41 ` [PATCH 3/7] drm/amdgpu: use dma_fence_get_status() for adapter reset Alex Deucher
@ 2025-12-18 22:41 ` Alex Deucher
  2025-12-18 22:41 ` [PATCH 5/7] drm/amdgpu: mark fences with errors before ring reset Alex Deucher
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Alex Deucher @ 2025-12-18 22:41 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Timur Kristóf

Only set an error on the fence if the fence is not
signalled.  We can end up with a warning if the
per queue reset path signals the fence and sets an error
as part of the reset, but fails to recover.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 67fde99724bad..7f5d01164897f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -147,7 +147,8 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
 	}
 
-	dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
+	if (dma_fence_get_status(&s_job->s_fence->finished) == 0)
+		dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
 
 	amdgpu_vm_put_task_info(ti);
 
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 5/7] drm/amdgpu: mark fences with errors before ring reset
  2025-12-18 22:41 [PATCH 1/7] drm/amdgpu: don't reemit ring contents more than once Alex Deucher
                   ` (2 preceding siblings ...)
  2025-12-18 22:41 ` [PATCH 4/7] drm/amdgpu: avoid a warning in timedout job handler Alex Deucher
@ 2025-12-18 22:41 ` Alex Deucher
  2025-12-18 22:41 ` [PATCH 6/7] drm/amdgpu/gfx9: rework pipeline sync packet sequence Alex Deucher
  2025-12-18 22:41 ` [PATCH 7/7] drm/amdgpu/gfx9: Implement KGQ ring reset Alex Deucher
  5 siblings, 0 replies; 11+ messages in thread
From: Alex Deucher @ 2025-12-18 22:41 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher

Mark fences with errors before we reset the rings as
we may end up signalling fences as part of the reset
sequence.  The error needs to be set before the fence
is signalled.  On GC10 and newer, this isn't a problem
since we don't signal any fences.  On GC9, we need to
signal the fence after the reset to unblock the recovery
sequence.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 600e6bb98af7a..5defdebd7091e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -872,6 +872,10 @@ void amdgpu_ring_reset_helper_begin(struct amdgpu_ring *ring,
 	drm_sched_wqueue_stop(&ring->sched);
 	/* back up the non-guilty commands */
 	amdgpu_ring_backup_unprocessed_commands(ring, guilty_fence);
+	/* signal the guilty fence and set an error on all fences from the context */
+	if (guilty_fence)
+		amdgpu_fence_driver_guilty_force_completion(guilty_fence);
+
 }
 
 int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
@@ -885,9 +889,6 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
 	if (r)
 		return r;
 
-	/* signal the guilty fence and set an error on all fences from the context */
-	if (guilty_fence)
-		amdgpu_fence_driver_guilty_force_completion(guilty_fence);
 	/* Re-emit the non-guilty commands */
 	if (ring->ring_backup_entries_to_copy) {
 		amdgpu_ring_alloc_reemit(ring, ring->ring_backup_entries_to_copy);
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 6/7] drm/amdgpu/gfx9: rework pipeline sync packet sequence
  2025-12-18 22:41 [PATCH 1/7] drm/amdgpu: don't reemit ring contents more than once Alex Deucher
                   ` (3 preceding siblings ...)
  2025-12-18 22:41 ` [PATCH 5/7] drm/amdgpu: mark fences with errors before ring reset Alex Deucher
@ 2025-12-18 22:41 ` Alex Deucher
  2025-12-19  1:17   ` Timur Kristóf
  2025-12-18 22:41 ` [PATCH 7/7] drm/amdgpu/gfx9: Implement KGQ ring reset Alex Deucher
  5 siblings, 1 reply; 11+ messages in thread
From: Alex Deucher @ 2025-12-18 22:41 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher

Replace WAIT_REG_MEM with EVENT_WRITE flushes for all
shader types and PFP_SYNC_ME.  That should accomplish
the same thing and avoid having to wait on a fence
preventing any issues with pipeline syncs during
queue resets.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 32 ++++++++++++++++++---------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 7b012ca1153ea..d9dee3c11a05d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -5572,15 +5572,26 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
 	amdgpu_ring_write(ring, 0);
 }
 
-static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
+static void gfx_v9_0_ring_emit_event_write(struct amdgpu_ring *ring,
+					   uint32_t event_type,
+					   uint32_t event_index)
 {
-	int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
-	uint32_t seq = ring->fence_drv.sync_seq;
-	uint64_t addr = ring->fence_drv.gpu_addr;
+	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE, 0));
+	amdgpu_ring_write(ring, EVENT_TYPE(event_type) |
+			  EVENT_INDEX(event_index));
+}
 
-	gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
-			      lower_32_bits(addr), upper_32_bits(addr),
-			      seq, 0xffffffff, 4);
+static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
+{
+	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
+		gfx_v9_0_ring_emit_event_write(ring, VS_PARTIAL_FLUSH, 4);
+		gfx_v9_0_ring_emit_event_write(ring, PS_PARTIAL_FLUSH, 4);
+		gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH, 4);
+		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
+		amdgpu_ring_write(ring, 0x0);
+	} else {
+		gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH, 4);
+	}
 }
 
 static void gfx_v9_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
@@ -7404,7 +7415,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.set_wptr = gfx_v9_0_ring_set_wptr_gfx,
 	.emit_frame_size = /* totally 242 maximum if 16 IBs */
 		5 +  /* COND_EXEC */
-		7 +  /* PIPELINE_SYNC */
+		8 +  /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* VM_FLUSH */
@@ -7460,7 +7471,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
 	.set_wptr = amdgpu_sw_ring_set_wptr_gfx,
 	.emit_frame_size = /* totally 242 maximum if 16 IBs */
 		5 +  /* COND_EXEC */
-		7 +  /* PIPELINE_SYNC */
+		8 +  /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* VM_FLUSH */
@@ -7521,7 +7532,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		20 + /* gfx_v9_0_ring_emit_gds_switch */
 		7 + /* gfx_v9_0_ring_emit_hdp_flush */
 		5 + /* hdp invalidate */
-		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
+		2 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
@@ -7564,7 +7575,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
 		20 + /* gfx_v9_0_ring_emit_gds_switch */
 		7 + /* gfx_v9_0_ring_emit_hdp_flush */
 		5 + /* hdp invalidate */
-		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 6/7] drm/amdgpu/gfx9: rework pipeline sync packet sequence
  2025-12-18 22:41 ` [PATCH 6/7] drm/amdgpu/gfx9: rework pipeline sync packet sequence Alex Deucher
@ 2025-12-19  1:17   ` Timur Kristóf
  2025-12-19 15:37     ` Alex Deucher
  0 siblings, 1 reply; 11+ messages in thread
From: Timur Kristóf @ 2025-12-19  1:17 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Alex Deucher

On 2025. december 18., csütörtök 16:41:40 középső államokbeli zónaidő Alex 
Deucher wrote:
> Replace WAIT_REG_MEM with EVENT_WRITE flushes for all
> shader types and PFP_SYNC_ME.  That should accomplish
> the same thing and avoid having to wait on a fence
> preventing any issues with pipeline syncs during
> queue resets.
> 
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 32 ++++++++++++++++++---------
>  1 file changed, 21 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 7b012ca1153ea..d9dee3c11a05d
> 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5572,15 +5572,26 @@ static void gfx_v9_0_ring_emit_fence(struct
> amdgpu_ring *ring, u64 addr, amdgpu_ring_write(ring, 0);
>  }
> 
> -static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> +static void gfx_v9_0_ring_emit_event_write(struct amdgpu_ring *ring,
> +					   uint32_t event_type,
> +					   uint32_t 
event_index)
>  {
> -	int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
> -	uint32_t seq = ring->fence_drv.sync_seq;
> -	uint64_t addr = ring->fence_drv.gpu_addr;
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE, 0));
> +	amdgpu_ring_write(ring, EVENT_TYPE(event_type) |
> +			  EVENT_INDEX(event_index));
> +}
> 
> -	gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
> -			      lower_32_bits(addr), 
upper_32_bits(addr),
> -			      seq, 0xffffffff, 4);
> +static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> +{
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
> +		gfx_v9_0_ring_emit_event_write(ring, VS_PARTIAL_FLUSH, 
4);

Is VS_PARTIAL_FLUSH necessary when we already have PS_PARTIAL_FLUSH?
When we wait for all PS to finish, wouldn't that imply that all VS had already 
finished as well?

> +		gfx_v9_0_ring_emit_event_write(ring, PS_PARTIAL_FLUSH, 
4);
> +		gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH, 
4);
> +		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 
0));
> +		amdgpu_ring_write(ring, 0x0);

The above sequence just waits for all shaders to finish, but as far as I 
understand it doesn't wait for memory writes and cache flushes. Please correct 
me if I'm wrong about this. For that, I think we do need an ACQUIRE_MEM 
packet. (And, if the ACQUIRE_MEM is done on the PFP then we won't need the 
PFP_SYNC_ME.)

> +	} else {
> +		gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH, 
4);
> +	}
>  }
> 
>  static void gfx_v9_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
> @@ -7404,7 +7415,7 @@ static const struct amdgpu_ring_funcs
> gfx_v9_0_ring_funcs_gfx = { .set_wptr = gfx_v9_0_ring_set_wptr_gfx,
>  	.emit_frame_size = /* totally 242 maximum if 16 IBs */
>  		5 +  /* COND_EXEC */
> -		7 +  /* PIPELINE_SYNC */
> +		8 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>  		2 + /* VM_FLUSH */
> @@ -7460,7 +7471,7 @@ static const struct amdgpu_ring_funcs
> gfx_v9_0_sw_ring_funcs_gfx = { .set_wptr = amdgpu_sw_ring_set_wptr_gfx,
>  	.emit_frame_size = /* totally 242 maximum if 16 IBs */
>  		5 +  /* COND_EXEC */
> -		7 +  /* PIPELINE_SYNC */
> +		8 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>  		2 + /* VM_FLUSH */
> @@ -7521,7 +7532,7 @@ static const struct amdgpu_ring_funcs
> gfx_v9_0_ring_funcs_compute = { 20 + /* gfx_v9_0_ring_emit_gds_switch */
>  		7 + /* gfx_v9_0_ring_emit_hdp_flush */
>  		5 + /* hdp invalidate */
> -		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> +		2 + /* gfx_v9_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>  		8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user 
fence, vm fence */
> @@ -7564,7 +7575,6 @@ static const struct amdgpu_ring_funcs
> gfx_v9_0_ring_funcs_kiq = { 20 + /* gfx_v9_0_ring_emit_gds_switch */
>  		7 + /* gfx_v9_0_ring_emit_hdp_flush */
>  		5 + /* hdp invalidate */
> -		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>  		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user 
fence, vm fence */





^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 6/7] drm/amdgpu/gfx9: rework pipeline sync packet sequence
  2025-12-19  1:17   ` Timur Kristóf
@ 2025-12-19 15:37     ` Alex Deucher
  2025-12-19 16:33       ` Timur Kristóf
  0 siblings, 1 reply; 11+ messages in thread
From: Alex Deucher @ 2025-12-19 15:37 UTC (permalink / raw)
  To: Timur Kristóf; +Cc: amd-gfx, Alex Deucher

On Thu, Dec 18, 2025 at 9:36 PM Timur Kristóf <timur.kristof@gmail.com> wrote:
>
> On 2025. december 18., csütörtök 16:41:40 középső államokbeli zónaidő Alex
> Deucher wrote:
> > Replace WAIT_REG_MEM with EVENT_WRITE flushes for all
> > shader types and PFP_SYNC_ME.  That should accomplish
> > the same thing and avoid having to wait on a fence
> > preventing any issues with pipeline syncs during
> > queue resets.
> >
> > Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 32 ++++++++++++++++++---------
> >  1 file changed, 21 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 7b012ca1153ea..d9dee3c11a05d
> > 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -5572,15 +5572,26 @@ static void gfx_v9_0_ring_emit_fence(struct
> > amdgpu_ring *ring, u64 addr, amdgpu_ring_write(ring, 0);
> >  }
> >
> > -static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> > +static void gfx_v9_0_ring_emit_event_write(struct amdgpu_ring *ring,
> > +                                        uint32_t event_type,
> > +                                        uint32_t
> event_index)
> >  {
> > -     int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
> > -     uint32_t seq = ring->fence_drv.sync_seq;
> > -     uint64_t addr = ring->fence_drv.gpu_addr;
> > +     amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE, 0));
> > +     amdgpu_ring_write(ring, EVENT_TYPE(event_type) |
> > +                       EVENT_INDEX(event_index));
> > +}
> >
> > -     gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
> > -                           lower_32_bits(addr),
> upper_32_bits(addr),
> > -                           seq, 0xffffffff, 4);
> > +static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> > +{
> > +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
> > +             gfx_v9_0_ring_emit_event_write(ring, VS_PARTIAL_FLUSH,
> 4);
>
> Is VS_PARTIAL_FLUSH necessary when we already have PS_PARTIAL_FLUSH?
> When we wait for all PS to finish, wouldn't that imply that all VS had already
> finished as well?

I'm not sure.  The CP docs recommend all 3 if you want to wait for the
engine to idle.

>
> > +             gfx_v9_0_ring_emit_event_write(ring, PS_PARTIAL_FLUSH,
> 4);
> > +             gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH,
> 4);
> > +             amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME,
> 0));
> > +             amdgpu_ring_write(ring, 0x0);
>
> The above sequence just waits for all shaders to finish, but as far as I
> understand it doesn't wait for memory writes and cache flushes. Please correct
> me if I'm wrong about this. For that, I think we do need an ACQUIRE_MEM
> packet. (And, if the ACQUIRE_MEM is done on the PFP then we won't need the
> PFP_SYNC_ME.)

There is already a RELEASE_MEM (the fence from the previous job) prior
to this packet that would have flushed the caches.  We just want to
block the PFP from further fetching until that is complete.  In the
good case, the RELEASE_MEM would have handled pipeline idling and
cache flushes so these would be effectively noops and in the reset
case, we don't care because that bad job is gone anyway.  I guess
probably all we really need is the PFP_SYNC_ME.

Alex

>
> > +     } else {
> > +             gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH,
> 4);
> > +     }
> >  }
> >
> >  static void gfx_v9_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
> > @@ -7404,7 +7415,7 @@ static const struct amdgpu_ring_funcs
> > gfx_v9_0_ring_funcs_gfx = { .set_wptr = gfx_v9_0_ring_set_wptr_gfx,
> >       .emit_frame_size = /* totally 242 maximum if 16 IBs */
> >               5 +  /* COND_EXEC */
> > -             7 +  /* PIPELINE_SYNC */
> > +             8 +  /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> >               2 + /* VM_FLUSH */
> > @@ -7460,7 +7471,7 @@ static const struct amdgpu_ring_funcs
> > gfx_v9_0_sw_ring_funcs_gfx = { .set_wptr = amdgpu_sw_ring_set_wptr_gfx,
> >       .emit_frame_size = /* totally 242 maximum if 16 IBs */
> >               5 +  /* COND_EXEC */
> > -             7 +  /* PIPELINE_SYNC */
> > +             8 +  /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> >               2 + /* VM_FLUSH */
> > @@ -7521,7 +7532,7 @@ static const struct amdgpu_ring_funcs
> > gfx_v9_0_ring_funcs_compute = { 20 + /* gfx_v9_0_ring_emit_gds_switch */
> >               7 + /* gfx_v9_0_ring_emit_hdp_flush */
> >               5 + /* hdp invalidate */
> > -             7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > +             2 + /* gfx_v9_0_ring_emit_pipeline_sync */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> >               8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user
> fence, vm fence */
> > @@ -7564,7 +7575,6 @@ static const struct amdgpu_ring_funcs
> > gfx_v9_0_ring_funcs_kiq = { 20 + /* gfx_v9_0_ring_emit_gds_switch */
> >               7 + /* gfx_v9_0_ring_emit_hdp_flush */
> >               5 + /* hdp invalidate */
> > -             7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> >               8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user
> fence, vm fence */
>
>
>
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 6/7] drm/amdgpu/gfx9: rework pipeline sync packet sequence
  2025-12-19 15:37     ` Alex Deucher
@ 2025-12-19 16:33       ` Timur Kristóf
  2025-12-19 16:46         ` Deucher, Alexander
  0 siblings, 1 reply; 11+ messages in thread
From: Timur Kristóf @ 2025-12-19 16:33 UTC (permalink / raw)
  To: Alex Deucher; +Cc: amd-gfx, Alex Deucher

On 2025. december 19., péntek 9:37:16 középső államokbeli zónaidő Alex Deucher 
wrote:
> On Thu, Dec 18, 2025 at 9:36 PM Timur Kristóf <timur.kristof@gmail.com> 
wrote:
> > On 2025. december 18., csütörtök 16:41:40 középső államokbeli zónaidő Alex
> > 
> > Deucher wrote:
> > > Replace WAIT_REG_MEM with EVENT_WRITE flushes for all
> > > shader types and PFP_SYNC_ME.  That should accomplish
> > > the same thing and avoid having to wait on a fence
> > > preventing any issues with pipeline syncs during
> > > queue resets.
> > > 
> > > Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> > > ---
> > > 
> > >  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 32 ++++++++++++++++++---------
> > >  1 file changed, 21 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index
> > > 7b012ca1153ea..d9dee3c11a05d
> > > 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > > @@ -5572,15 +5572,26 @@ static void gfx_v9_0_ring_emit_fence(struct
> > > amdgpu_ring *ring, u64 addr, amdgpu_ring_write(ring, 0);
> > > 
> > >  }
> > > 
> > > -static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> > > +static void gfx_v9_0_ring_emit_event_write(struct amdgpu_ring *ring,
> > > +                                        uint32_t event_type,
> > > +                                        uint32_t
> > 
> > event_index)
> > 
> > >  {
> > > 
> > > -     int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
> > > -     uint32_t seq = ring->fence_drv.sync_seq;
> > > -     uint64_t addr = ring->fence_drv.gpu_addr;
> > > +     amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE, 0));
> > > +     amdgpu_ring_write(ring, EVENT_TYPE(event_type) |
> > > +                       EVENT_INDEX(event_index));
> > > +}
> > > 
> > > -     gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
> > > -                           lower_32_bits(addr),
> > 
> > upper_32_bits(addr),
> > 
> > > -                           seq, 0xffffffff, 4);
> > > +static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> > > +{
> > > +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
> > > +             gfx_v9_0_ring_emit_event_write(ring, VS_PARTIAL_FLUSH,
> > 
> > 4);
> > 
> > Is VS_PARTIAL_FLUSH necessary when we already have PS_PARTIAL_FLUSH?
> > When we wait for all PS to finish, wouldn't that imply that all VS had
> > already finished as well?
> 
> I'm not sure.  The CP docs recommend all 3 if you want to wait for the
> engine to idle.

Alright, it doesn't hurt to have it here.

> 
> > > +             gfx_v9_0_ring_emit_event_write(ring, PS_PARTIAL_FLUSH,
> > 
> > 4);
> > 
> > > +             gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH,
> > 
> > 4);
> > 
> > > +             amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME,
> > 
> > 0));
> > 
> > > +             amdgpu_ring_write(ring, 0x0);
> > 
> > The above sequence just waits for all shaders to finish, but as far as I
> > understand it doesn't wait for memory writes and cache flushes. Please
> > correct me if I'm wrong about this. For that, I think we do need an
> > ACQUIRE_MEM packet. (And, if the ACQUIRE_MEM is done on the PFP then we
> > won't need the PFP_SYNC_ME.)
> 
> There is already a RELEASE_MEM (the fence from the previous job) prior
> to this packet that would have flushed the caches.  We just want to
> block the PFP from further fetching until that is complete.  In the
> good case, the RELEASE_MEM would have handled pipeline idling and
> cache flushes so these would be effectively noops and in the reset
> case, we don't care because that bad job is gone anyway.  I guess
> probably all we really need is the PFP_SYNC_ME.
> 
> Alex

RELEASE_MEM doesn't wait for the GPU to go idle, RELEASE_MEM just promises to 
write to the given fence address when the specified operations (eg. shaders and 
cache flush) are complete. Here in the pipeline sync, we actually want to wait 
for the GPU to go idle, and AFAIK we need an ACQUIRE_MEM for that.

> 
> > > +     } else {
> > > +             gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH,
> > 
> > 4);
> > 
> > > +     }
> > > 
> > >  }
> > >  
> > >  static void gfx_v9_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
> > > 
> > > @@ -7404,7 +7415,7 @@ static const struct amdgpu_ring_funcs
> > > gfx_v9_0_ring_funcs_gfx = { .set_wptr = gfx_v9_0_ring_set_wptr_gfx,
> > > 
> > >       .emit_frame_size = /* totally 242 maximum if 16 IBs */
> > >       
> > >               5 +  /* COND_EXEC */
> > > 
> > > -             7 +  /* PIPELINE_SYNC */
> > > +             8 +  /* PIPELINE_SYNC */
> > > 
> > >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > >               2 + /* VM_FLUSH */
> > > 
> > > @@ -7460,7 +7471,7 @@ static const struct amdgpu_ring_funcs
> > > gfx_v9_0_sw_ring_funcs_gfx = { .set_wptr = amdgpu_sw_ring_set_wptr_gfx,
> > > 
> > >       .emit_frame_size = /* totally 242 maximum if 16 IBs */
> > >       
> > >               5 +  /* COND_EXEC */
> > > 
> > > -             7 +  /* PIPELINE_SYNC */
> > > +             8 +  /* PIPELINE_SYNC */
> > > 
> > >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > >               2 + /* VM_FLUSH */
> > > 
> > > @@ -7521,7 +7532,7 @@ static const struct amdgpu_ring_funcs
> > > gfx_v9_0_ring_funcs_compute = { 20 + /* gfx_v9_0_ring_emit_gds_switch */
> > > 
> > >               7 + /* gfx_v9_0_ring_emit_hdp_flush */
> > >               5 + /* hdp invalidate */
> > > 
> > > -             7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > > +             2 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > > 
> > >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > >               8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user
> > 
> > fence, vm fence */
> > 
> > > @@ -7564,7 +7575,6 @@ static const struct amdgpu_ring_funcs
> > > gfx_v9_0_ring_funcs_kiq = { 20 + /* gfx_v9_0_ring_emit_gds_switch */
> > > 
> > >               7 + /* gfx_v9_0_ring_emit_hdp_flush */
> > >               5 + /* hdp invalidate */
> > > 
> > > -             7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > > 
> > >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > >               8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user
> > 
> > fence, vm fence */





^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH 6/7] drm/amdgpu/gfx9: rework pipeline sync packet sequence
  2025-12-19 16:33       ` Timur Kristóf
@ 2025-12-19 16:46         ` Deucher, Alexander
  0 siblings, 0 replies; 11+ messages in thread
From: Deucher, Alexander @ 2025-12-19 16:46 UTC (permalink / raw)
  To: Timur Kristóf, Alex Deucher; +Cc: amd-gfx@lists.freedesktop.org

[Public]

> -----Original Message-----
> From: Timur Kristóf <timur.kristof@gmail.com>
> Sent: Friday, December 19, 2025 11:34 AM
> To: Alex Deucher <alexdeucher@gmail.com>
> Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander
> <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH 6/7] drm/amdgpu/gfx9: rework pipeline sync packet
> sequence
>
> On 2025. december 19., péntek 9:37:16 középső államokbeli zónaidő Alex
> Deucher
> wrote:
> > On Thu, Dec 18, 2025 at 9:36 PM Timur Kristóf
> > <timur.kristof@gmail.com>
> wrote:
> > > On 2025. december 18., csütörtök 16:41:40 középső államokbeli
> > > zónaidő Alex
> > >
> > > Deucher wrote:
> > > > Replace WAIT_REG_MEM with EVENT_WRITE flushes for all shader types
> > > > and PFP_SYNC_ME.  That should accomplish the same thing and avoid
> > > > having to wait on a fence preventing any issues with pipeline
> > > > syncs during queue resets.
> > > >
> > > > Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> > > > ---
> > > >
> > > >  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 32
> > > > ++++++++++++++++++---------
> > > >  1 file changed, 21 insertions(+), 11 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index
> > > > 7b012ca1153ea..d9dee3c11a05d
> > > > 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > > > @@ -5572,15 +5572,26 @@ static void
> > > > gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
> > > > amdgpu_ring_write(ring, 0);
> > > >
> > > >  }
> > > >
> > > > -static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring
> > > > *ring)
> > > > +static void gfx_v9_0_ring_emit_event_write(struct amdgpu_ring *ring,
> > > > +                                        uint32_t event_type,
> > > > +                                        uint32_t
> > >
> > > event_index)
> > >
> > > >  {
> > > >
> > > > -     int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
> > > > -     uint32_t seq = ring->fence_drv.sync_seq;
> > > > -     uint64_t addr = ring->fence_drv.gpu_addr;
> > > > +     amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE, 0));
> > > > +     amdgpu_ring_write(ring, EVENT_TYPE(event_type) |
> > > > +                       EVENT_INDEX(event_index)); }
> > > >
> > > > -     gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
> > > > -                           lower_32_bits(addr),
> > >
> > > upper_32_bits(addr),
> > >
> > > > -                           seq, 0xffffffff, 4);
> > > > +static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring
> > > > +*ring) {
> > > > +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
> > > > +             gfx_v9_0_ring_emit_event_write(ring,
> > > > +VS_PARTIAL_FLUSH,
> > >
> > > 4);
> > >
> > > Is VS_PARTIAL_FLUSH necessary when we already have
> PS_PARTIAL_FLUSH?
> > > When we wait for all PS to finish, wouldn't that imply that all VS
> > > had already finished as well?
> >
> > I'm not sure.  The CP docs recommend all 3 if you want to wait for the
> > engine to idle.
>
> Alright, it doesn't hurt to have it here.
>
> >
> > > > +             gfx_v9_0_ring_emit_event_write(ring,
> > > > + PS_PARTIAL_FLUSH,
> > >
> > > 4);
> > >
> > > > +             gfx_v9_0_ring_emit_event_write(ring,
> > > > + CS_PARTIAL_FLUSH,
> > >
> > > 4);
> > >
> > > > +             amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME,
> > >
> > > 0));
> > >
> > > > +             amdgpu_ring_write(ring, 0x0);
> > >
> > > The above sequence just waits for all shaders to finish, but as far
> > > as I understand it doesn't wait for memory writes and cache flushes.
> > > Please correct me if I'm wrong about this. For that, I think we do
> > > need an ACQUIRE_MEM packet. (And, if the ACQUIRE_MEM is done on
> the
> > > PFP then we won't need the PFP_SYNC_ME.)
> >
> > There is already a RELEASE_MEM (the fence from the previous job) prior
> > to this packet that would have flushed the caches.  We just want to
> > block the PFP from further fetching until that is complete.  In the
> > good case, the RELEASE_MEM would have handled pipeline idling and
> > cache flushes so these would be effectively noops and in the reset
> > case, we don't care because that bad job is gone anyway.  I guess
> > probably all we really need is the PFP_SYNC_ME.
> >
> > Alex
>
> RELEASE_MEM doesn't wait for the GPU to go idle, RELEASE_MEM just
> promises to write to the given fence address when the specified operations
> (eg. shaders and cache flush) are complete. Here in the pipeline sync, we
> actually want to wait for the GPU to go idle, and AFAIK we need an
> ACQUIRE_MEM for that.

I was thinking the EVENT_WRITE would handle that, but you're right, I don't think it handles the caches, only the pipeline, and the fence is asynchronous.  I'll add the ACQUIRE_MEM.

Alex

>
> >
> > > > +     } else {
> > > > +             gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH,
> > >
> > > 4);
> > >
> > > > +     }
> > > >
> > > >  }
> > > >
> > > >  static void gfx_v9_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
> > > >
> > > > @@ -7404,7 +7415,7 @@ static const struct amdgpu_ring_funcs
> > > > gfx_v9_0_ring_funcs_gfx = { .set_wptr = gfx_v9_0_ring_set_wptr_gfx,
> > > >
> > > >       .emit_frame_size = /* totally 242 maximum if 16 IBs */
> > > >
> > > >               5 +  /* COND_EXEC */
> > > >
> > > > -             7 +  /* PIPELINE_SYNC */
> > > > +             8 +  /* PIPELINE_SYNC */
> > > >
> > > >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > > >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > > >               2 + /* VM_FLUSH */
> > > >
> > > > @@ -7460,7 +7471,7 @@ static const struct amdgpu_ring_funcs
> > > > gfx_v9_0_sw_ring_funcs_gfx = { .set_wptr =
> amdgpu_sw_ring_set_wptr_gfx,
> > > >
> > > >       .emit_frame_size = /* totally 242 maximum if 16 IBs */
> > > >
> > > >               5 +  /* COND_EXEC */
> > > >
> > > > -             7 +  /* PIPELINE_SYNC */
> > > > +             8 +  /* PIPELINE_SYNC */
> > > >
> > > >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > > >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > > >               2 + /* VM_FLUSH */
> > > >
> > > > @@ -7521,7 +7532,7 @@ static const struct amdgpu_ring_funcs
> > > > gfx_v9_0_ring_funcs_compute = { 20 + /*
> gfx_v9_0_ring_emit_gds_switch */
> > > >
> > > >               7 + /* gfx_v9_0_ring_emit_hdp_flush */
> > > >               5 + /* hdp invalidate */
> > > >
> > > > -             7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > > > +             2 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > > >
> > > >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > > >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > > >               8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user
> > >
> > > fence, vm fence */
> > >
> > > > @@ -7564,7 +7575,6 @@ static const struct amdgpu_ring_funcs
> > > > gfx_v9_0_ring_funcs_kiq = { 20 + /* gfx_v9_0_ring_emit_gds_switch */
> > > >
> > > >               7 + /* gfx_v9_0_ring_emit_hdp_flush */
> > > >               5 + /* hdp invalidate */
> > > >
> > > > -             7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > > >
> > > >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > > >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > > >               8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user
> > >
> > > fence, vm fence */
>
>
>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 7/7] drm/amdgpu/gfx9: Implement KGQ ring reset
  2025-12-18 22:41 [PATCH 1/7] drm/amdgpu: don't reemit ring contents more than once Alex Deucher
                   ` (4 preceding siblings ...)
  2025-12-18 22:41 ` [PATCH 6/7] drm/amdgpu/gfx9: rework pipeline sync packet sequence Alex Deucher
@ 2025-12-18 22:41 ` Alex Deucher
  5 siblings, 0 replies; 11+ messages in thread
From: Alex Deucher @ 2025-12-18 22:41 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Jiqian Chen

GFX ring resets work differently on pre-GFX10 hardware since
there is no MQD managed by the scheduler.
For ring reset, you need issue the reset via CP_VMID_RESET
via KIQ or MMIO and submit the following to the gfx ring to
complete the reset:
1. EOP packet with EXEC bit set
2. WAIT_REG_MEM to wait for the fence
3. Clear CP_VMID_RESET to 0
4. EVENT_WRITE ENABLE_LEGACY_PIPELINE
5. EOP packet with EXEC bit set
6. WAIT_REG_MEM to wait for the fence
Once those commands have completed the reset should
be complete and the ring can accept new packets.

Tested-by: Jiqian Chen <Jiqian.Chen@amd.com> (v1)
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 92 ++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index d9dee3c11a05d..9ffd7c8adc57a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2411,8 +2411,10 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
 		amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
 	adev->gfx.compute_supported_reset =
 		amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
-	if (!amdgpu_sriov_vf(adev) && !adev->debug_disable_gpu_ring_reset)
+	if (!amdgpu_sriov_vf(adev) && !adev->debug_disable_gpu_ring_reset) {
 		adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+		adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+	}
 
 	r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0);
 	if (r) {
@@ -7175,6 +7177,91 @@ static void gfx_v9_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
 	amdgpu_ring_insert_nop(ring, num_nop - 1);
 }
 
+static void gfx_v9_0_ring_emit_wreg_me(struct amdgpu_ring *ring,
+				       uint32_t reg,
+				       uint32_t val)
+{
+	uint32_t cmd = 0;
+
+	switch (ring->funcs->type) {
+	case AMDGPU_RING_TYPE_KIQ:
+		cmd = (1 << 16); /* no inc addr */
+		break;
+	default:
+		cmd = WR_CONFIRM;
+		break;
+	}
+	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
+	amdgpu_ring_write(ring, cmd);
+	amdgpu_ring_write(ring, reg);
+	amdgpu_ring_write(ring, 0);
+	amdgpu_ring_write(ring, val);
+}
+
+static int gfx_v9_0_reset_kgq(struct amdgpu_ring *ring,
+			      unsigned int vmid,
+			      struct amdgpu_fence *timedout_fence)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq[0];
+	struct amdgpu_ring *kiq_ring = &kiq->ring;
+	unsigned long flags;
+	u32 tmp;
+	int r;
+
+	amdgpu_ring_reset_helper_begin(ring, timedout_fence);
+
+	spin_lock_irqsave(&kiq->ring_lock, flags);
+
+	if (amdgpu_ring_alloc(kiq_ring, 5)) {
+		spin_unlock_irqrestore(&kiq->ring_lock, flags);
+		return -ENOMEM;
+	}
+
+	/* send the reset - 5 */
+	tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 << vmid);
+	gfx_v9_0_ring_emit_wreg(kiq_ring,
+				SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), tmp);
+	amdgpu_ring_commit(kiq_ring);
+	r = amdgpu_ring_test_ring(kiq_ring);
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	if (r)
+		return r;
+
+	if (amdgpu_ring_alloc(ring, 8 + 7 + 5 + 2 + 8 + 7))
+		return -ENOMEM;
+	/* emit the fence to finish the reset - 8 */
+	ring->trail_seq++;
+	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
+				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
+	/* wait for the fence - 7 */
+	gfx_v9_0_wait_reg_mem(ring, 0, 1, 0,
+			      lower_32_bits(ring->trail_fence_gpu_addr),
+			      upper_32_bits(ring->trail_fence_gpu_addr),
+			      ring->trail_seq, 0xffffffff, 4);
+	/* clear mmCP_VMID_RESET - 5 */
+	gfx_v9_0_ring_emit_wreg_me(ring,
+				   SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0);
+	/* event write ENABLE_LEGACY_PIPELINE - 2 */
+	gfx_v9_0_ring_emit_event_write(ring, ENABLE_LEGACY_PIPELINE, 0);
+	/* emit a regular fence - 8 */
+	ring->trail_seq++;
+	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
+				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
+	/* wait for the fence - 7 */
+	gfx_v9_0_wait_reg_mem(ring, 1, 1, 0,
+			      lower_32_bits(ring->trail_fence_gpu_addr),
+			      upper_32_bits(ring->trail_fence_gpu_addr),
+			      ring->trail_seq, 0xffffffff, 4);
+	amdgpu_ring_commit(ring);
+	/* wait for the commands to complete */
+	r = amdgpu_ring_test_ring(ring);
+	if (r)
+		return r;
+
+	return amdgpu_ring_reset_helper_end(ring, timedout_fence);
+}
+
 static int gfx_v9_0_reset_kcq(struct amdgpu_ring *ring,
 			      unsigned int vmid,
 			      struct amdgpu_fence *timedout_fence)
@@ -7453,9 +7540,9 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
-	.soft_recovery = gfx_v9_0_ring_soft_recovery,
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
 	.emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader,
+	.reset = gfx_v9_0_reset_kgq,
 	.begin_use = amdgpu_gfx_enforce_isolation_ring_begin_use,
 	.end_use = amdgpu_gfx_enforce_isolation_ring_end_use,
 };
@@ -7554,7 +7641,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
-	.soft_recovery = gfx_v9_0_ring_soft_recovery,
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
 	.emit_wave_limit = gfx_v9_0_emit_wave_limit,
 	.reset = gfx_v9_0_reset_kcq,
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2025-12-19 16:46 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-12-18 22:41 [PATCH 1/7] drm/amdgpu: don't reemit ring contents more than once Alex Deucher
2025-12-18 22:41 ` [PATCH 2/7] drm/amdgpu: always backup and reemit fences Alex Deucher
2025-12-18 22:41 ` [PATCH 3/7] drm/amdgpu: use dma_fence_get_status() for adapter reset Alex Deucher
2025-12-18 22:41 ` [PATCH 4/7] drm/amdgpu: avoid a warning in timedout job handler Alex Deucher
2025-12-18 22:41 ` [PATCH 5/7] drm/amdgpu: mark fences with errors before ring reset Alex Deucher
2025-12-18 22:41 ` [PATCH 6/7] drm/amdgpu/gfx9: rework pipeline sync packet sequence Alex Deucher
2025-12-19  1:17   ` Timur Kristóf
2025-12-19 15:37     ` Alex Deucher
2025-12-19 16:33       ` Timur Kristóf
2025-12-19 16:46         ` Deucher, Alexander
2025-12-18 22:41 ` [PATCH 7/7] drm/amdgpu/gfx9: Implement KGQ ring reset Alex Deucher

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.