[PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
@ 2024-01-14 13:00 ` Friedrich Vock
  0 siblings, 0 replies; 28+ messages in thread
From: Friedrich Vock @ 2024-01-14 13:00 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Friedrich Vock, stable, Joshua Ashton

Allows us to detect subsequent IH ring buffer overflows as well.

Cc: Joshua Ashton <joshua@froggi.es>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org

Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  |  2 ++
 drivers/gpu/drm/amd/amdgpu/cik_ih.c     | 13 +++++++++++++
 drivers/gpu/drm/amd/amdgpu/cz_ih.c      | 14 +++++++++++++-
 drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 14 +++++++++++++-
 drivers/gpu/drm/amd/amdgpu/ih_v6_0.c    | 13 +++++++++++++
 drivers/gpu/drm/amd/amdgpu/ih_v6_1.c    | 13 +++++++++++++
 drivers/gpu/drm/amd/amdgpu/navi10_ih.c  | 12 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/si_ih.c      | 12 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/tonga_ih.c   | 13 +++++++++++++
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c  | 12 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/vega20_ih.c  | 12 ++++++++++++
 11 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
index 508f02eb0cf8..6041ec727f06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
@@ -69,6 +69,8 @@ struct amdgpu_ih_ring {
 	unsigned		rptr;
 	struct amdgpu_ih_regs	ih_regs;

+	bool overflow;
+
 	/* For waiting on IH processing at checkpoint. */
 	wait_queue_head_t wait_process;
 	uint64_t		processed_timestamp;
diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
index 6f7c031dd197..807cc30c9e33 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
@@ -204,6 +204,7 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev,
 		tmp = RREG32(mmIH_RB_CNTL);
 		tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
 		WREG32(mmIH_RB_CNTL, tmp);
+		ih->overflow = true;
 	}
 	return (wptr & ih->ptr_mask);
 }
@@ -274,7 +275,19 @@ static void cik_ih_decode_iv(struct amdgpu_device *adev,
 static void cik_ih_set_rptr(struct amdgpu_device *adev,
 			    struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
+
 	WREG32(mmIH_RB_RPTR, ih->rptr);
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32(mmIH_RB_CNTL);
+		tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
+		WREG32(mmIH_RB_CNTL, tmp);
+		ih->overflow = false;
+	}
 }

 static int cik_ih_early_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
index b8c47e0cf37a..076559668573 100644
--- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
@@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32(mmIH_RB_CNTL);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32(mmIH_RB_CNTL, tmp);
-
+	ih->overflow = true;

 out:
 	return (wptr & ih->ptr_mask);
@@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct amdgpu_device *adev,
 static void cz_ih_set_rptr(struct amdgpu_device *adev,
 			   struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
+
 	WREG32(mmIH_RB_RPTR, ih->rptr);
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32(mmIH_RB_CNTL);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32(mmIH_RB_CNTL, tmp);
+		ih->overflow = false;
+	}
 }

 static int cz_ih_early_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
index aecad530b10a..1a5e668643d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
@@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32(mmIH_RB_CNTL);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32(mmIH_RB_CNTL, tmp);
-
+	ih->overflow = true;

 out:
 	return (wptr & ih->ptr_mask);
@@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct amdgpu_device *adev,
 static void iceland_ih_set_rptr(struct amdgpu_device *adev,
 				struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
+
 	WREG32(mmIH_RB_RPTR, ih->rptr);
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32(mmIH_RB_CNTL);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32(mmIH_RB_CNTL, tmp);
+		ih->overflow = false;
+	}
 }

 static int iceland_ih_early_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
index d9ed7332d805..ce8f7feec713 100644
--- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
@@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
+	ih->overflow = true;
+
 out:
 	return (wptr & ih->ptr_mask);
 }
@@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct amdgpu_device *adev,
 static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
 			       struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
 	struct amdgpu_ih_regs *ih_regs;

 	if (ih->use_doorbell) {
@@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
 		ih_regs = &ih->ih_regs;
 		WREG32(ih_regs->ih_rb_rptr, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
+		ih->overflow = false;
+	}
 }

 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
index 8fb05eae340a..668788ad34d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
@@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
+	ih->overflow = true;
+
 out:
 	return (wptr & ih->ptr_mask);
 }
@@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct amdgpu_device *adev,
 static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
 			       struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
 	struct amdgpu_ih_regs *ih_regs;

 	if (ih->use_doorbell) {
@@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
 		ih_regs = &ih->ih_regs;
 		WREG32(ih_regs->ih_rb_rptr, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
+		ih->overflow = false;
+	}
 }

 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
index e64b33115848..0bdac923cb4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
@@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
+	ih->overflow = true;
 out:
 	return (wptr & ih->ptr_mask);
 }
@@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct amdgpu_device *adev,
 static void navi10_ih_set_rptr(struct amdgpu_device *adev,
 			       struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
 	struct amdgpu_ih_regs *ih_regs;

 	if (ih == &adev->irq.ih_soft)
@@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct amdgpu_device *adev,
 		ih_regs = &ih->ih_regs;
 		WREG32(ih_regs->ih_rb_rptr, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
+		ih->overflow = false;
+	}
 }

 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c
index 9a24f17a5750..ff35056d2b54 100644
--- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
@@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct amdgpu_device *adev,
 		tmp = RREG32(IH_RB_CNTL);
 		tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
 		WREG32(IH_RB_CNTL, tmp);
+		ih->overflow = true;
 	}
 	return (wptr & ih->ptr_mask);
 }
@@ -147,7 +148,19 @@ static void si_ih_decode_iv(struct amdgpu_device *adev,
 static void si_ih_set_rptr(struct amdgpu_device *adev,
 			   struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
+
 	WREG32(IH_RB_RPTR, ih->rptr);
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32(IH_RB_CNTL);
+		tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
+		WREG32(IH_RB_CNTL, tmp);
+		ih->overflow = false;
+	}
 }

 static int si_ih_early_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
index 917707bba7f3..6f5090d3db48 100644
--- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
@@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32(mmIH_RB_CNTL);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32(mmIH_RB_CNTL, tmp);
+	ih->overflow = true;

 out:
 	return (wptr & ih->ptr_mask);
@@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct amdgpu_device *adev,
 static void tonga_ih_set_rptr(struct amdgpu_device *adev,
 			      struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
+
 	if (ih->use_doorbell) {
 		/* XXX check if swapping is necessary on BE */
 		*ih->rptr_cpu = ih->rptr;
@@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct amdgpu_device *adev,
 	} else {
 		WREG32(mmIH_RB_RPTR, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32(mmIH_RB_CNTL);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32(mmIH_RB_CNTL, tmp);
+		ih->overflow = false;
+	}
 }

 static int tonga_ih_early_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
index d364c6dd152c..bb005924f194 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
@@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
+	ih->overflow = true;

 out:
 	return (wptr & ih->ptr_mask);
@@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct amdgpu_device *adev,
 static void vega10_ih_set_rptr(struct amdgpu_device *adev,
 			       struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
 	struct amdgpu_ih_regs *ih_regs;

 	if (ih == &adev->irq.ih_soft)
@@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct amdgpu_device *adev,
 		ih_regs = &ih->ih_regs;
 		WREG32(ih_regs->ih_rb_rptr, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
+		ih->overflow = false;
+	}
 }

 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
index ddfc6941f9d5..bb725a970697 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
@@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
+	ih->overflow = true;

 out:
 	return (wptr & ih->ptr_mask);
@@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct amdgpu_device *adev,
 static void vega20_ih_set_rptr(struct amdgpu_device *adev,
 			       struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
 	struct amdgpu_ih_regs *ih_regs;

 	if (ih == &adev->irq.ih_soft)
@@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct amdgpu_device *adev,
 		ih_regs = &ih->ih_regs;
 		WREG32(ih_regs->ih_rb_rptr, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
+		ih->overflow = false;
+	}
 }

 /**
--
2.43.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
@ 2024-01-14 13:00 ` Friedrich Vock
  0 siblings, 0 replies; 28+ messages in thread
From: Friedrich Vock @ 2024-01-14 13:00 UTC (permalink / raw)
  To: amd-gfx; +Cc: Friedrich Vock, Joshua Ashton, Alex Deucher, stable

Allows us to detect subsequent IH ring buffer overflows as well.

Cc: Joshua Ashton <joshua@froggi.es>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org

Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  |  2 ++
 drivers/gpu/drm/amd/amdgpu/cik_ih.c     | 13 +++++++++++++
 drivers/gpu/drm/amd/amdgpu/cz_ih.c      | 14 +++++++++++++-
 drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 14 +++++++++++++-
 drivers/gpu/drm/amd/amdgpu/ih_v6_0.c    | 13 +++++++++++++
 drivers/gpu/drm/amd/amdgpu/ih_v6_1.c    | 13 +++++++++++++
 drivers/gpu/drm/amd/amdgpu/navi10_ih.c  | 12 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/si_ih.c      | 12 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/tonga_ih.c   | 13 +++++++++++++
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c  | 12 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/vega20_ih.c  | 12 ++++++++++++
 11 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
index 508f02eb0cf8..6041ec727f06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
@@ -69,6 +69,8 @@ struct amdgpu_ih_ring {
 	unsigned		rptr;
 	struct amdgpu_ih_regs	ih_regs;

+	bool overflow;
+
 	/* For waiting on IH processing at checkpoint. */
 	wait_queue_head_t wait_process;
 	uint64_t		processed_timestamp;
diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
index 6f7c031dd197..807cc30c9e33 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
@@ -204,6 +204,7 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev,
 		tmp = RREG32(mmIH_RB_CNTL);
 		tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
 		WREG32(mmIH_RB_CNTL, tmp);
+		ih->overflow = true;
 	}
 	return (wptr & ih->ptr_mask);
 }
@@ -274,7 +275,19 @@ static void cik_ih_decode_iv(struct amdgpu_device *adev,
 static void cik_ih_set_rptr(struct amdgpu_device *adev,
 			    struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
+
 	WREG32(mmIH_RB_RPTR, ih->rptr);
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32(mmIH_RB_CNTL);
+		tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
+		WREG32(mmIH_RB_CNTL, tmp);
+		ih->overflow = false;
+	}
 }

 static int cik_ih_early_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
index b8c47e0cf37a..076559668573 100644
--- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
@@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32(mmIH_RB_CNTL);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32(mmIH_RB_CNTL, tmp);
-
+	ih->overflow = true;

 out:
 	return (wptr & ih->ptr_mask);
@@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct amdgpu_device *adev,
 static void cz_ih_set_rptr(struct amdgpu_device *adev,
 			   struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
+
 	WREG32(mmIH_RB_RPTR, ih->rptr);
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32(mmIH_RB_CNTL);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32(mmIH_RB_CNTL, tmp);
+		ih->overflow = false;
+	}
 }

 static int cz_ih_early_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
index aecad530b10a..1a5e668643d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
@@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32(mmIH_RB_CNTL);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32(mmIH_RB_CNTL, tmp);
-
+	ih->overflow = true;

 out:
 	return (wptr & ih->ptr_mask);
@@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct amdgpu_device *adev,
 static void iceland_ih_set_rptr(struct amdgpu_device *adev,
 				struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
+
 	WREG32(mmIH_RB_RPTR, ih->rptr);
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32(mmIH_RB_CNTL);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32(mmIH_RB_CNTL, tmp);
+		ih->overflow = false;
+	}
 }

 static int iceland_ih_early_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
index d9ed7332d805..ce8f7feec713 100644
--- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
@@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
+	ih->overflow = true;
+
 out:
 	return (wptr & ih->ptr_mask);
 }
@@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct amdgpu_device *adev,
 static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
 			       struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
 	struct amdgpu_ih_regs *ih_regs;

 	if (ih->use_doorbell) {
@@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
 		ih_regs = &ih->ih_regs;
 		WREG32(ih_regs->ih_rb_rptr, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
+		ih->overflow = false;
+	}
 }

 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
index 8fb05eae340a..668788ad34d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
@@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
+	ih->overflow = true;
+
 out:
 	return (wptr & ih->ptr_mask);
 }
@@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct amdgpu_device *adev,
 static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
 			       struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
 	struct amdgpu_ih_regs *ih_regs;

 	if (ih->use_doorbell) {
@@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
 		ih_regs = &ih->ih_regs;
 		WREG32(ih_regs->ih_rb_rptr, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
+		ih->overflow = false;
+	}
 }

 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
index e64b33115848..0bdac923cb4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
@@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
+	ih->overflow = true;
 out:
 	return (wptr & ih->ptr_mask);
 }
@@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct amdgpu_device *adev,
 static void navi10_ih_set_rptr(struct amdgpu_device *adev,
 			       struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
 	struct amdgpu_ih_regs *ih_regs;

 	if (ih == &adev->irq.ih_soft)
@@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct amdgpu_device *adev,
 		ih_regs = &ih->ih_regs;
 		WREG32(ih_regs->ih_rb_rptr, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
+		ih->overflow = false;
+	}
 }

 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c
index 9a24f17a5750..ff35056d2b54 100644
--- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
@@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct amdgpu_device *adev,
 		tmp = RREG32(IH_RB_CNTL);
 		tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
 		WREG32(IH_RB_CNTL, tmp);
+		ih->overflow = true;
 	}
 	return (wptr & ih->ptr_mask);
 }
@@ -147,7 +148,19 @@ static void si_ih_decode_iv(struct amdgpu_device *adev,
 static void si_ih_set_rptr(struct amdgpu_device *adev,
 			   struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
+
 	WREG32(IH_RB_RPTR, ih->rptr);
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32(IH_RB_CNTL);
+		tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
+		WREG32(IH_RB_CNTL, tmp);
+		ih->overflow = false;
+	}
 }

 static int si_ih_early_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
index 917707bba7f3..6f5090d3db48 100644
--- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
@@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32(mmIH_RB_CNTL);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32(mmIH_RB_CNTL, tmp);
+	ih->overflow = true;

 out:
 	return (wptr & ih->ptr_mask);
@@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct amdgpu_device *adev,
 static void tonga_ih_set_rptr(struct amdgpu_device *adev,
 			      struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
+
 	if (ih->use_doorbell) {
 		/* XXX check if swapping is necessary on BE */
 		*ih->rptr_cpu = ih->rptr;
@@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct amdgpu_device *adev,
 	} else {
 		WREG32(mmIH_RB_RPTR, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32(mmIH_RB_CNTL);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32(mmIH_RB_CNTL, tmp);
+		ih->overflow = false;
+	}
 }

 static int tonga_ih_early_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
index d364c6dd152c..bb005924f194 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
@@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
+	ih->overflow = true;

 out:
 	return (wptr & ih->ptr_mask);
@@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct amdgpu_device *adev,
 static void vega10_ih_set_rptr(struct amdgpu_device *adev,
 			       struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
 	struct amdgpu_ih_regs *ih_regs;

 	if (ih == &adev->irq.ih_soft)
@@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct amdgpu_device *adev,
 		ih_regs = &ih->ih_regs;
 		WREG32(ih_regs->ih_rb_rptr, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
+		ih->overflow = false;
+	}
 }

 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
index ddfc6941f9d5..bb725a970697 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
@@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct amdgpu_device *adev,
 	tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
 	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
 	WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
+	ih->overflow = true;

 out:
 	return (wptr & ih->ptr_mask);
@@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct amdgpu_device *adev,
 static void vega20_ih_set_rptr(struct amdgpu_device *adev,
 			       struct amdgpu_ih_ring *ih)
 {
+	u32 tmp;
 	struct amdgpu_ih_regs *ih_regs;

 	if (ih == &adev->irq.ih_soft)
@@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct amdgpu_device *adev,
 		ih_regs = &ih->ih_regs;
 		WREG32(ih_regs->ih_rb_rptr, ih->rptr);
 	}
+
+	/* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit),
+	 * reset it here to detect more overflows if they occur.
+	 */
+	if (ih->overflow) {
+		tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
+		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
+		WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
+		ih->overflow = false;
+	}
 }

 /**
--
2.43.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/2] drm/amdgpu: Process fences on IH overflow
  2024-01-14 13:00 ` Friedrich Vock
@ 2024-01-14 13:00   ` Friedrich Vock
  -1 siblings, 0 replies; 28+ messages in thread
From: Friedrich Vock @ 2024-01-14 13:00 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Friedrich Vock, stable, Joshua Ashton

If the IH ring buffer overflows, it's possible that fence signal events
were lost. Check each ring for progress to prevent job timeouts/GPU
hangs due to the fences staying unsignaled despite the work being done.

Cc: Joshua Ashton <joshua@froggi.es>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org

Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index f3b0aaf3ebc6..2a246db1d3a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
 {
 	unsigned int count;
 	u32 wptr;
+	int i;

 	if (!ih->enabled || adev->shutdown)
 		return IRQ_NONE;
@@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
 		ih->rptr &= ih->ptr_mask;
 	}

+	/* If the ring buffer overflowed, we might have lost some fence
+	 * signal interrupts. Check if there was any activity so the signal
+	 * doesn't get lost.
+	 */
+	if (ih->overflow) {
+		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+			struct amdgpu_ring *ring = adev->rings[i];
+
+			if (!ring || !ring->fence_drv.initialized)
+				continue;
+			amdgpu_fence_process(ring);
+		}
+	}
+
 	amdgpu_ih_set_rptr(adev, ih);
 	wake_up_all(&ih->wait_process);

--
2.43.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/2] drm/amdgpu: Process fences on IH overflow
@ 2024-01-14 13:00   ` Friedrich Vock
  0 siblings, 0 replies; 28+ messages in thread
From: Friedrich Vock @ 2024-01-14 13:00 UTC (permalink / raw)
  To: amd-gfx; +Cc: Friedrich Vock, Joshua Ashton, Alex Deucher, stable

If the IH ring buffer overflows, it's possible that fence signal events
were lost. Check each ring for progress to prevent job timeouts/GPU
hangs due to the fences staying unsignaled despite the work being done.

Cc: Joshua Ashton <joshua@froggi.es>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org

Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index f3b0aaf3ebc6..2a246db1d3a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
 {
 	unsigned int count;
 	u32 wptr;
+	int i;

 	if (!ih->enabled || adev->shutdown)
 		return IRQ_NONE;
@@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
 		ih->rptr &= ih->ptr_mask;
 	}

+	/* If the ring buffer overflowed, we might have lost some fence
+	 * signal interrupts. Check if there was any activity so the signal
+	 * doesn't get lost.
+	 */
+	if (ih->overflow) {
+		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+			struct amdgpu_ring *ring = adev->rings[i];
+
+			if (!ring || !ring->fence_drv.initialized)
+				continue;
+			amdgpu_fence_process(ring);
+		}
+	}
+
 	amdgpu_ih_set_rptr(adev, ih);
 	wake_up_all(&ih->wait_process);

--
2.43.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Process fences on IH overflow
  2024-01-14 13:00   ` Friedrich Vock
@ 2024-01-15 10:26     ` Christian König
  -1 siblings, 0 replies; 28+ messages in thread
From: Christian König @ 2024-01-15 10:26 UTC (permalink / raw)
  To: Friedrich Vock, amd-gfx; +Cc: Alex Deucher, Joshua Ashton, stable

Am 14.01.24 um 14:00 schrieb Friedrich Vock:
> If the IH ring buffer overflows, it's possible that fence signal events
> were lost. Check each ring for progress to prevent job timeouts/GPU
> hangs due to the fences staying unsignaled despite the work being done.

That's completely unnecessary and in some cases even harmful.

We already have a timeout handler for that, and overflows point to a severe
system problem, so they should never occur in a production system.

Regards,
Christian.

>
> Cc: Joshua Ashton <joshua@froggi.es>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: stable@vger.kernel.org
>
> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++
>   1 file changed, 15 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> index f3b0aaf3ebc6..2a246db1d3a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> @@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
>   {
>   	unsigned int count;
>   	u32 wptr;
> +	int i;
>
>   	if (!ih->enabled || adev->shutdown)
>   		return IRQ_NONE;
> @@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
>   		ih->rptr &= ih->ptr_mask;
>   	}
>
> +	/* If the ring buffer overflowed, we might have lost some fence
> +	 * signal interrupts. Check if there was any activity so the signal
> +	 * doesn't get lost.
> +	 */
> +	if (ih->overflow) {
> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +			struct amdgpu_ring *ring = adev->rings[i];
> +
> +			if (!ring || !ring->fence_drv.initialized)
> +				continue;
> +			amdgpu_fence_process(ring);
> +		}
> +	}
> +
>   	amdgpu_ih_set_rptr(adev, ih);
>   	wake_up_all(&ih->wait_process);
>
> --
> 2.43.0
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Process fences on IH overflow
@ 2024-01-15 10:26     ` Christian König
  0 siblings, 0 replies; 28+ messages in thread
From: Christian König @ 2024-01-15 10:26 UTC (permalink / raw)
  To: Friedrich Vock, amd-gfx; +Cc: Alex Deucher, stable, Joshua Ashton

Am 14.01.24 um 14:00 schrieb Friedrich Vock:
> If the IH ring buffer overflows, it's possible that fence signal events
> were lost. Check each ring for progress to prevent job timeouts/GPU
> hangs due to the fences staying unsignaled despite the work being done.

That's completely unnecessary and in some cases even harmful.

We already have a timeout handler for that, and overflows point to a severe
system problem, so they should never occur in a production system.

Regards,
Christian.

>
> Cc: Joshua Ashton <joshua@froggi.es>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: stable@vger.kernel.org
>
> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++
>   1 file changed, 15 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> index f3b0aaf3ebc6..2a246db1d3a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> @@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
>   {
>   	unsigned int count;
>   	u32 wptr;
> +	int i;
>
>   	if (!ih->enabled || adev->shutdown)
>   		return IRQ_NONE;
> @@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
>   		ih->rptr &= ih->ptr_mask;
>   	}
>
> +	/* If the ring buffer overflowed, we might have lost some fence
> +	 * signal interrupts. Check if there was any activity so the signal
> +	 * doesn't get lost.
> +	 */
> +	if (ih->overflow) {
> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +			struct amdgpu_ring *ring = adev->rings[i];
> +
> +			if (!ring || !ring->fence_drv.initialized)
> +				continue;
> +			amdgpu_fence_process(ring);
> +		}
> +	}
> +
>   	amdgpu_ih_set_rptr(adev, ih);
>   	wake_up_all(&ih->wait_process);
>
> --
> 2.43.0
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
       [not found] ` <69cec077-4011-4738-bbb0-8fb1e6f52159@gmail.com>
@ 2024-01-15 11:18   ` Friedrich Vock
  2024-01-16  7:03     ` Christian König
  0 siblings, 1 reply; 28+ messages in thread
From: Friedrich Vock @ 2024-01-15 11:18 UTC (permalink / raw)
  To: Christian König; +Cc: Alex Deucher, Joshua Ashton, amd-gfx

Adding the original Ccs from the thread since they seemed to be missing
in the reply.

On 15.01.24 11:55, Christian König wrote:
> Am 14.01.24 um 14:00 schrieb Friedrich Vock:
>> Allows us to detect subsequent IH ring buffer overflows as well.
>
> Well, the handling suggested here is certainly broken; see below.
>
>>
>> Cc: Joshua Ashton <joshua@froggi.es>
>> Cc: Alex Deucher <alexander.deucher@amd.com>
>> Cc: stable@vger.kernel.org
>>
>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/cik_ih.c     | 13 +++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/cz_ih.c      | 14 +++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 14 +++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/ih_v6_0.c    | 13 +++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/ih_v6_1.c    | 13 +++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c  | 12 ++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/si_ih.c      | 12 ++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/tonga_ih.c   | 13 +++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c  | 12 ++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/vega20_ih.c  | 12 ++++++++++++
>>   11 files changed, 128 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>> index 508f02eb0cf8..6041ec727f06 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>> @@ -69,6 +69,8 @@ struct amdgpu_ih_ring {
>>       unsigned        rptr;
>>       struct amdgpu_ih_regs    ih_regs;
>>
>> +    bool overflow;
>> +
>>       /* For waiting on IH processing at checkpoint. */
>>       wait_queue_head_t wait_process;
>>       uint64_t        processed_timestamp;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> index 6f7c031dd197..807cc30c9e33 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> @@ -204,6 +204,7 @@ static u32 cik_ih_get_wptr(struct amdgpu_device
>> *adev,
>>           tmp = RREG32(mmIH_RB_CNTL);
>>           tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>           WREG32(mmIH_RB_CNTL, tmp);
>> +        ih->overflow = true;
>>       }
>>       return (wptr & ih->ptr_mask);
>>   }
>> @@ -274,7 +275,19 @@ static void cik_ih_decode_iv(struct
>> amdgpu_device *adev,
>>   static void cik_ih_set_rptr(struct amdgpu_device *adev,
>>                   struct amdgpu_ih_ring *ih)
>>   {
>> +    u32 tmp;
>> +
>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>> +
>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>> bit),
>> +     * reset it here to detect more overflows if they occur.
>> +     */
>> +    if (ih->overflow) {
>> +        tmp = RREG32(mmIH_RB_CNTL);
>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>> +        WREG32(mmIH_RB_CNTL, tmp);
>> +        ih->overflow = false;
>> +    }
>
> Well that is an extremely bad idea. We already reset the overflow
> after reading the WPTR.

This is not resetting the overflow bit. This is resetting a "clear
overflow" bit. I don't have the hardware docs, but the name (and my
observations) strongly suggest that setting this bit actually prevents
the hardware from setting the overflow bit ever again.

Right now, IH overflows, even if they occur repeatedly, only get
registered once. If not registering IH overflows can trivially lead to
system crashes, it's amdgpu's current handling that is broken.

The possibility of a repeated IH overflow in between reading the wptr
and updating the rptr is a good point, but how can we detect that at
all? It seems to me like we can't set the OVERFLOW_CLEAR bit at all
then, because we're guaranteed to miss any overflows that happen while
the bit is set.

Regards,
Friedrich

>
> When you clear the overflow again when updating the RPTR, you could
> lose another overflow which might have happened in between and so
> potentially process corrupted IVs.
>
> That can trivially crash the system.
>
> Regards,
> Christian.
>
>>   }
>>
>>   static int cik_ih_early_init(void *handle)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> index b8c47e0cf37a..076559668573 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct amdgpu_device
>> *adev,
>>       tmp = RREG32(mmIH_RB_CNTL);
>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>       WREG32(mmIH_RB_CNTL, tmp);
>> -
>> +    ih->overflow = true;
>>
>>   out:
>>       return (wptr & ih->ptr_mask);
>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct amdgpu_device
>> *adev,
>>   static void cz_ih_set_rptr(struct amdgpu_device *adev,
>>                  struct amdgpu_ih_ring *ih)
>>   {
>> +    u32 tmp;
>> +
>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>> +
>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>> bit),
>> +     * reset it here to detect more overflows if they occur.
>> +     */
>> +    if (ih->overflow) {
>> +        tmp = RREG32(mmIH_RB_CNTL);
>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>> +        WREG32(mmIH_RB_CNTL, tmp);
>> +        ih->overflow = false;
>> +    }
>>   }
>>
>>   static int cz_ih_early_init(void *handle)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> index aecad530b10a..1a5e668643d1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
>> amdgpu_device *adev,
>>       tmp = RREG32(mmIH_RB_CNTL);
>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>       WREG32(mmIH_RB_CNTL, tmp);
>> -
>> +    ih->overflow = true;
>>
>>   out:
>>       return (wptr & ih->ptr_mask);
>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
>> amdgpu_device *adev,
>>   static void iceland_ih_set_rptr(struct amdgpu_device *adev,
>>                   struct amdgpu_ih_ring *ih)
>>   {
>> +    u32 tmp;
>> +
>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>> +
>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>> bit),
>> +     * reset it here to detect more overflows if they occur.
>> +     */
>> +    if (ih->overflow) {
>> +        tmp = RREG32(mmIH_RB_CNTL);
>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>> +        WREG32(mmIH_RB_CNTL, tmp);
>> +        ih->overflow = false;
>> +    }
>>   }
>>
>>   static int iceland_ih_early_init(void *handle)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>> index d9ed7332d805..ce8f7feec713 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct amdgpu_device
>> *adev,
>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>> +    ih->overflow = true;
>> +
>>   out:
>>       return (wptr & ih->ptr_mask);
>>   }
>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
>> amdgpu_device *adev,
>>   static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
>>                      struct amdgpu_ih_ring *ih)
>>   {
>> +    u32 tmp;
>>       struct amdgpu_ih_regs *ih_regs;
>>
>>       if (ih->use_doorbell) {
>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
>> amdgpu_device *adev,
>>           ih_regs = &ih->ih_regs;
>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>       }
>> +
>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>> bit),
>> +     * reset it here to detect more overflows if they occur.
>> +     */
>> +    if (ih->overflow) {
>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>> +        ih->overflow = false;
>> +    }
>>   }
>>
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>> index 8fb05eae340a..668788ad34d9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct amdgpu_device
>> *adev,
>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>> +    ih->overflow = true;
>> +
>>   out:
>>       return (wptr & ih->ptr_mask);
>>   }
>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
>> amdgpu_device *adev,
>>   static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
>>                      struct amdgpu_ih_ring *ih)
>>   {
>> +    u32 tmp;
>>       struct amdgpu_ih_regs *ih_regs;
>>
>>       if (ih->use_doorbell) {
>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
>> amdgpu_device *adev,
>>           ih_regs = &ih->ih_regs;
>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>       }
>> +
>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>> bit),
>> +     * reset it here to detect more overflows if they occur.
>> +     */
>> +    if (ih->overflow) {
>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>> +        ih->overflow = false;
>> +    }
>>   }
>>
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> index e64b33115848..0bdac923cb4d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
>> amdgpu_device *adev,
>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>> +    ih->overflow = true;
>>   out:
>>       return (wptr & ih->ptr_mask);
>>   }
>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
>> amdgpu_device *adev,
>>   static void navi10_ih_set_rptr(struct amdgpu_device *adev,
>>                      struct amdgpu_ih_ring *ih)
>>   {
>> +    u32 tmp;
>>       struct amdgpu_ih_regs *ih_regs;
>>
>>       if (ih == &adev->irq.ih_soft)
>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
>> amdgpu_device *adev,
>>           ih_regs = &ih->ih_regs;
>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>       }
>> +
>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>> bit),
>> +     * reset it here to detect more overflows if they occur.
>> +     */
>> +    if (ih->overflow) {
>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>> +        ih->overflow = false;
>> +    }
>>   }
>>
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> index 9a24f17a5750..ff35056d2b54 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct amdgpu_device
>> *adev,
>>           tmp = RREG32(IH_RB_CNTL);
>>           tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>           WREG32(IH_RB_CNTL, tmp);
>> +        ih->overflow = true;
>>       }
>>       return (wptr & ih->ptr_mask);
>>   }
>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct amdgpu_device
>> *adev,
>>   static void si_ih_set_rptr(struct amdgpu_device *adev,
>>                  struct amdgpu_ih_ring *ih)
>>   {
>> +    u32 tmp;
>> +
>>       WREG32(IH_RB_RPTR, ih->rptr);
>> +
>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>> bit),
>> +     * reset it here to detect more overflows if they occur.
>> +     */
>> +    if (ih->overflow) {
>> +        tmp = RREG32(IH_RB_CNTL);
>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>> +        WREG32(IH_RB_CNTL, tmp);
>> +    }
>>   }
>>
>>   static int si_ih_early_init(void *handle)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> index 917707bba7f3..6f5090d3db48 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device
>> *adev,
>>       tmp = RREG32(mmIH_RB_CNTL);
>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>       WREG32(mmIH_RB_CNTL, tmp);
>> +    ih->overflow = true;
>>
>>   out:
>>       return (wptr & ih->ptr_mask);
>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
>> amdgpu_device *adev,
>>   static void tonga_ih_set_rptr(struct amdgpu_device *adev,
>>                     struct amdgpu_ih_ring *ih)
>>   {
>> +    u32 tmp;
>> +
>>       if (ih->use_doorbell) {
>>           /* XXX check if swapping is necessary on BE */
>>           *ih->rptr_cpu = ih->rptr;
>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
>> amdgpu_device *adev,
>>       } else {
>>           WREG32(mmIH_RB_RPTR, ih->rptr);
>>       }
>> +
>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>> bit),
>> +     * reset it here to detect more overflows if they occur.
>> +     */
>> +    if (ih->overflow) {
>> +        tmp = RREG32(mmIH_RB_CNTL);
>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>> +        WREG32(mmIH_RB_CNTL, tmp);
>> +        ih->overflow = false;
>> +    }
>>   }
>>
>>   static int tonga_ih_early_init(void *handle)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> index d364c6dd152c..bb005924f194 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
>> amdgpu_device *adev,
>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>> +    ih->overflow = true;
>>
>>   out:
>>       return (wptr & ih->ptr_mask);
>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
>> amdgpu_device *adev,
>>   static void vega10_ih_set_rptr(struct amdgpu_device *adev,
>>                      struct amdgpu_ih_ring *ih)
>>   {
>> +    u32 tmp;
>>       struct amdgpu_ih_regs *ih_regs;
>>
>>       if (ih == &adev->irq.ih_soft)
>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
>> amdgpu_device *adev,
>>           ih_regs = &ih->ih_regs;
>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>       }
>> +
>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>> bit),
>> +     * reset it here to detect more overflows if they occur.
>> +     */
>> +    if (ih->overflow) {
>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>> +        ih->overflow = false;
>> +    }
>>   }
>>
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>> index ddfc6941f9d5..bb725a970697 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
>> amdgpu_device *adev,
>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>> +    ih->overflow = true;
>>
>>   out:
>>       return (wptr & ih->ptr_mask);
>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
>> amdgpu_device *adev,
>>   static void vega20_ih_set_rptr(struct amdgpu_device *adev,
>>                      struct amdgpu_ih_ring *ih)
>>   {
>> +    u32 tmp;
>>       struct amdgpu_ih_regs *ih_regs;
>>
>>       if (ih == &adev->irq.ih_soft)
>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
>> amdgpu_device *adev,
>>           ih_regs = &ih->ih_regs;
>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>       }
>> +
>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>> bit),
>> +     * reset it here to detect more overflows if they occur.
>> +     */
>> +    if (ih->overflow) {
>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>> +        ih->overflow = false;
>> +    }
>>   }
>>
>>   /**
>> --
>> 2.43.0
>>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Process fences on IH overflow
  2024-01-15 10:26     ` Christian König
@ 2024-01-15 11:19       ` Friedrich Vock
  -1 siblings, 0 replies; 28+ messages in thread
From: Friedrich Vock @ 2024-01-15 11:19 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: Alex Deucher, stable, Joshua Ashton

On 15.01.24 11:26, Christian König wrote:
> Am 14.01.24 um 14:00 schrieb Friedrich Vock:
>> If the IH ring buffer overflows, it's possible that fence signal events
>> were lost. Check each ring for progress to prevent job timeouts/GPU
>> hangs due to the fences staying unsignaled despite the work being done.
>
> That's completely unnecessary and in some cases even harmful.
How is it harmful? The only effect it can have is to prevent unnecessary
GPU hangs, no? It's not like it hides any legitimate errors that you'd
otherwise see.
>
> We already have a timeout handler for that, and overflows point to a
> severe system problem, so they should never occur in a production system.

IH ring buffer overflows are pretty reliably reproducible if you trigger
a lot of page faults, at least on Deck. Why shouldn't enough page faults
in quick succession be able to overflow the IH ring buffer?

The fence fallback timer as it is now is useless for this because it
only gets triggered once after 0.5s. I guess an alternative approach
would be to make a timer trigger for each work item in flight every
0.5s, but why should that be better than just handling overflow errors
as they occur?

Regards,
Friedrich

>
> Regards,
> Christian.
>
>>
>> Cc: Joshua Ashton <joshua@froggi.es>
>> Cc: Alex Deucher <alexander.deucher@amd.com>
>> Cc: stable@vger.kernel.org
>>
>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++
>>   1 file changed, 15 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> index f3b0aaf3ebc6..2a246db1d3a7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> @@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev,
>> struct amdgpu_ih_ring *ih)
>>   {
>>       unsigned int count;
>>       u32 wptr;
>> +    int i;
>>
>>       if (!ih->enabled || adev->shutdown)
>>           return IRQ_NONE;
>> @@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device
>> *adev, struct amdgpu_ih_ring *ih)
>>           ih->rptr &= ih->ptr_mask;
>>       }
>>
>> +    /* If the ring buffer overflowed, we might have lost some fence
>> +     * signal interrupts. Check if there was any activity so the signal
>> +     * doesn't get lost.
>> +     */
>> +    if (ih->overflow) {
>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +            struct amdgpu_ring *ring = adev->rings[i];
>> +
>> +            if (!ring || !ring->fence_drv.initialized)
>> +                continue;
>> +            amdgpu_fence_process(ring);
>> +        }
>> +    }
>> +
>>       amdgpu_ih_set_rptr(adev, ih);
>>       wake_up_all(&ih->wait_process);
>>
>> --
>> 2.43.0
>>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Process fences on IH overflow
@ 2024-01-15 11:19       ` Friedrich Vock
  0 siblings, 0 replies; 28+ messages in thread
From: Friedrich Vock @ 2024-01-15 11:19 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: Alex Deucher, Joshua Ashton, stable

On 15.01.24 11:26, Christian König wrote:
> Am 14.01.24 um 14:00 schrieb Friedrich Vock:
>> If the IH ring buffer overflows, it's possible that fence signal events
>> were lost. Check each ring for progress to prevent job timeouts/GPU
>> hangs due to the fences staying unsignaled despite the work being done.
>
> That's completely unnecessary and in some cases even harmful.
How is it harmful? The only effect it can have is to prevent unnecessary
GPU hangs, no? It's not like it hides any legitimate errors that you'd
otherwise see.
>
> We already have a timeout handler for that, and overflows point to a
> severe system problem, so they should never occur in a production system.

IH ring buffer overflows are pretty reliably reproducible if you trigger
a lot of page faults, at least on Deck. Why shouldn't enough page faults
in quick succession be able to overflow the IH ring buffer?

The fence fallback timer as it is now is useless for this because it
only gets triggered once after 0.5s. I guess an alternative approach
would be to make a timer trigger for each work item in flight every
0.5s, but why should that be better than just handling overflow errors
as they occur?

Regards,
Friedrich

>
> Regards,
> Christian.
>
>>
>> Cc: Joshua Ashton <joshua@froggi.es>
>> Cc: Alex Deucher <alexander.deucher@amd.com>
>> Cc: stable@vger.kernel.org
>>
>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++
>>   1 file changed, 15 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> index f3b0aaf3ebc6..2a246db1d3a7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> @@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev,
>> struct amdgpu_ih_ring *ih)
>>   {
>>       unsigned int count;
>>       u32 wptr;
>> +    int i;
>>
>>       if (!ih->enabled || adev->shutdown)
>>           return IRQ_NONE;
>> @@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device
>> *adev, struct amdgpu_ih_ring *ih)
>>           ih->rptr &= ih->ptr_mask;
>>       }
>>
>> +    /* If the ring buffer overflowed, we might have lost some fence
>> +     * signal interrupts. Check if there was any activity so the signal
>> +     * doesn't get lost.
>> +     */
>> +    if (ih->overflow) {
>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +            struct amdgpu_ring *ring = adev->rings[i];
>> +
>> +            if (!ring || !ring->fence_drv.initialized)
>> +                continue;
>> +            amdgpu_fence_process(ring);
>> +        }
>> +    }
>> +
>>       amdgpu_ih_set_rptr(adev, ih);
>>       wake_up_all(&ih->wait_process);
>>
>> --
>> 2.43.0
>>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-15 11:18   ` [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr Friedrich Vock
@ 2024-01-16  7:03     ` Christian König
  2024-01-16 10:31       ` Friedrich Vock
  0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2024-01-16  7:03 UTC (permalink / raw)
  To: Friedrich Vock; +Cc: Alex Deucher, Joshua Ashton, amd-gfx

Am 15.01.24 um 12:18 schrieb Friedrich Vock:
> Adding the original Ccs from the thread since they seemed to be missing
> in the reply.
>
> On 15.01.24 11:55, Christian König wrote:
>> Am 14.01.24 um 14:00 schrieb Friedrich Vock:
>>> Allows us to detect subsequent IH ring buffer overflows as well.
>>
>> Well that suggested handling here is certainly broken, see below.
>>
>>>
>>> Cc: Joshua Ashton <joshua@froggi.es>
>>> Cc: Alex Deucher <alexander.deucher@amd.com>
>>> Cc: stable@vger.kernel.org
>>>
>>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  |  2 ++
>>>   drivers/gpu/drm/amd/amdgpu/cik_ih.c     | 13 +++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/cz_ih.c      | 14 +++++++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 14 +++++++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/ih_v6_0.c    | 13 +++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/ih_v6_1.c    | 13 +++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c  | 12 ++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/si_ih.c      | 12 ++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/tonga_ih.c   | 13 +++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c  | 12 ++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/vega20_ih.c  | 12 ++++++++++++
>>>   11 files changed, 128 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>> index 508f02eb0cf8..6041ec727f06 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>> @@ -69,6 +69,8 @@ struct amdgpu_ih_ring {
>>>       unsigned        rptr;
>>>       struct amdgpu_ih_regs    ih_regs;
>>>
>>> +    bool overflow;
>>> +
>>>       /* For waiting on IH processing at checkpoint. */
>>>       wait_queue_head_t wait_process;
>>>       uint64_t        processed_timestamp;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>> b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>> index 6f7c031dd197..807cc30c9e33 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>> @@ -204,6 +204,7 @@ static u32 cik_ih_get_wptr(struct amdgpu_device
>>> *adev,
>>>           tmp = RREG32(mmIH_RB_CNTL);
>>>           tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>           WREG32(mmIH_RB_CNTL, tmp);
>>> +        ih->overflow = true;
>>>       }
>>>       return (wptr & ih->ptr_mask);
>>>   }
>>> @@ -274,7 +275,19 @@ static void cik_ih_decode_iv(struct
>>> amdgpu_device *adev,
>>>   static void cik_ih_set_rptr(struct amdgpu_device *adev,
>>>                   struct amdgpu_ih_ring *ih)
>>>   {
>>> +    u32 tmp;
>>> +
>>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>>> +
>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>> bit),
>>> +     * reset it here to detect more overflows if they occur.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>> +        ih->overflow = false;
>>> +    }
>>
>> Well that is an extremely bad idea. We already reset the overflow
>> after reading the WPTR.
>
> This is not resetting the overflow bit. This is resetting a "clear
> overflow" bit. I don't have the hardware docs, but the name (and my
> observations) strongly suggest that setting this bit actually prevents
> the hardware from setting the overflow bit ever again.

Well that doesn't make any sense at all. The hardware documentation 
clearly states that this bit is write only and should always read as zero.

Setting this bit will clear the overflow flag in the WPTR register and 
clearing it has no effect at all.

I could only ping the hw engineer responsible for this block to double 
check if the documentation is somehow outdated, but I really doubt so.

> Right now, IH overflows, even if they occur repeatedly, only get
> registered once. If not registering IH overflows can trivially lead to
> system crashes, it's amdgpu's current handling that is broken.

It's been years since we last tested this, but according to the HW
documentation this should work fine.

What could potentially happen is that the IH has silenced the source of
the overflow. We never implemented resetting those, but in that case
this patch won't help either.

>
> The possibility of a repeated IH overflow in between reading the wptr
> and updating the rptr is a good point, but how can we detect that at
> all? It seems to me like we can't set the OVERFLOW_CLEAR bit at all
> then, because we're guaranteed to miss any overflows that happen while
> the bit is set.

When an IH overflow is signaled we clear that flag by writing 1 into the 
OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.

What can of course happen is that the IH ring buffer overflows more than 
this single entry and we process IVs which are potentially corrupted, 
but we won't miss any additional overflows since we only start 
processing after resetting the flag.

An IH overflow is also something you should *never* see in a production 
system. This is purely for driver bringup and as fallback when there is 
a severe incorrect programming of the HW.

The only exception to that is page fault handling on MI products because
of a hardware bug; to mitigate this we are processing page faults on a
separate IH ring on those parts.

On all other hw generations the IH should have some rate limit for the 
number of faults generated per second, so that the CPU is always able to 
catch up.

Regards,
Christian.

>
> Regards,
> Friedrich
>
>>
>> When you clear the overflow again when updating the RPTR you could
>> lose another overflow which might have happened in between and so
>> potentially process corrupted IVs.
>>
>> That can trivially crash the system.
>>
>> Regards,
>> Christian.
>>
>>>   }
>>>
>>>   static int cik_ih_early_init(void *handle)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>> index b8c47e0cf37a..076559668573 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct amdgpu_device
>>> *adev,
>>>       tmp = RREG32(mmIH_RB_CNTL);
>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>       WREG32(mmIH_RB_CNTL, tmp);
>>> -
>>> +    ih->overflow = true;
>>>
>>>   out:
>>>       return (wptr & ih->ptr_mask);
>>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct amdgpu_device
>>> *adev,
>>>   static void cz_ih_set_rptr(struct amdgpu_device *adev,
>>>                  struct amdgpu_ih_ring *ih)
>>>   {
>>> +    u32 tmp;
>>> +
>>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>>> +
>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>> bit),
>>> +     * reset it here to detect more overflows if they occur.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>> +        ih->overflow = false;
>>> +    }
>>>   }
>>>
>>>   static int cz_ih_early_init(void *handle)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>> index aecad530b10a..1a5e668643d1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
>>> amdgpu_device *adev,
>>>       tmp = RREG32(mmIH_RB_CNTL);
>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>       WREG32(mmIH_RB_CNTL, tmp);
>>> -
>>> +    ih->overflow = true;
>>>
>>>   out:
>>>       return (wptr & ih->ptr_mask);
>>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
>>> amdgpu_device *adev,
>>>   static void iceland_ih_set_rptr(struct amdgpu_device *adev,
>>>                   struct amdgpu_ih_ring *ih)
>>>   {
>>> +    u32 tmp;
>>> +
>>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>>> +
>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>> bit),
>>> +     * reset it here to detect more overflows if they occur.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>> +        ih->overflow = false;
>>> +    }
>>>   }
>>>
>>>   static int iceland_ih_early_init(void *handle)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>> index d9ed7332d805..ce8f7feec713 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct amdgpu_device
>>> *adev,
>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>> +    ih->overflow = true;
>>> +
>>>   out:
>>>       return (wptr & ih->ptr_mask);
>>>   }
>>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
>>> amdgpu_device *adev,
>>>   static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
>>>                      struct amdgpu_ih_ring *ih)
>>>   {
>>> +    u32 tmp;
>>>       struct amdgpu_ih_regs *ih_regs;
>>>
>>>       if (ih->use_doorbell) {
>>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
>>> amdgpu_device *adev,
>>>           ih_regs = &ih->ih_regs;
>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>       }
>>> +
>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>> bit),
>>> +     * reset it here to detect more overflows if they occur.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>> +        ih->overflow = false;
>>> +    }
>>>   }
>>>
>>>   /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>> index 8fb05eae340a..668788ad34d9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct amdgpu_device
>>> *adev,
>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>> +    ih->overflow = true;
>>> +
>>>   out:
>>>       return (wptr & ih->ptr_mask);
>>>   }
>>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
>>> amdgpu_device *adev,
>>>   static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
>>>                      struct amdgpu_ih_ring *ih)
>>>   {
>>> +    u32 tmp;
>>>       struct amdgpu_ih_regs *ih_regs;
>>>
>>>       if (ih->use_doorbell) {
>>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
>>> amdgpu_device *adev,
>>>           ih_regs = &ih->ih_regs;
>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>       }
>>> +
>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>> bit),
>>> +     * reset it here to detect more overflows if they occur.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>> +        ih->overflow = false;
>>> +    }
>>>   }
>>>
>>>   /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>> index e64b33115848..0bdac923cb4d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
>>> amdgpu_device *adev,
>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>> +    ih->overflow = true;
>>>   out:
>>>       return (wptr & ih->ptr_mask);
>>>   }
>>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
>>> amdgpu_device *adev,
>>>   static void navi10_ih_set_rptr(struct amdgpu_device *adev,
>>>                      struct amdgpu_ih_ring *ih)
>>>   {
>>> +    u32 tmp;
>>>       struct amdgpu_ih_regs *ih_regs;
>>>
>>>       if (ih == &adev->irq.ih_soft)
>>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
>>> amdgpu_device *adev,
>>>           ih_regs = &ih->ih_regs;
>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>       }
>>> +
>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>> bit),
>>> +     * reset it here to detect more overflows if they occur.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>> +        ih->overflow = false;
>>> +    }
>>>   }
>>>
>>>   /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>> index 9a24f17a5750..ff35056d2b54 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct amdgpu_device
>>> *adev,
>>>           tmp = RREG32(IH_RB_CNTL);
>>>           tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>           WREG32(IH_RB_CNTL, tmp);
>>> +        ih->overflow = true;
>>>       }
>>>       return (wptr & ih->ptr_mask);
>>>   }
>>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct amdgpu_device
>>> *adev,
>>>   static void si_ih_set_rptr(struct amdgpu_device *adev,
>>>                  struct amdgpu_ih_ring *ih)
>>>   {
>>> +    u32 tmp;
>>> +
>>>       WREG32(IH_RB_RPTR, ih->rptr);
>>> +
>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>> bit),
>>> +     * reset it here to detect more overflows if they occur.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        tmp = RREG32(IH_RB_CNTL);
>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>> +        WREG32(IH_RB_CNTL, tmp);
>>> +    }
>>>   }
>>>
>>>   static int si_ih_early_init(void *handle)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>> index 917707bba7f3..6f5090d3db48 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device
>>> *adev,
>>>       tmp = RREG32(mmIH_RB_CNTL);
>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>       WREG32(mmIH_RB_CNTL, tmp);
>>> +    ih->overflow = true;
>>>
>>>   out:
>>>       return (wptr & ih->ptr_mask);
>>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
>>> amdgpu_device *adev,
>>>   static void tonga_ih_set_rptr(struct amdgpu_device *adev,
>>>                     struct amdgpu_ih_ring *ih)
>>>   {
>>> +    u32 tmp;
>>> +
>>>       if (ih->use_doorbell) {
>>>           /* XXX check if swapping is necessary on BE */
>>>           *ih->rptr_cpu = ih->rptr;
>>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
>>> amdgpu_device *adev,
>>>       } else {
>>>           WREG32(mmIH_RB_RPTR, ih->rptr);
>>>       }
>>> +
>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>> bit),
>>> +     * reset it here to detect more overflows if they occur.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>> +        ih->overflow = false;
>>> +    }
>>>   }
>>>
>>>   static int tonga_ih_early_init(void *handle)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>> index d364c6dd152c..bb005924f194 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
>>> amdgpu_device *adev,
>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>> +    ih->overflow = true;
>>>
>>>   out:
>>>       return (wptr & ih->ptr_mask);
>>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
>>> amdgpu_device *adev,
>>>   static void vega10_ih_set_rptr(struct amdgpu_device *adev,
>>>                      struct amdgpu_ih_ring *ih)
>>>   {
>>> +    u32 tmp;
>>>       struct amdgpu_ih_regs *ih_regs;
>>>
>>>       if (ih == &adev->irq.ih_soft)
>>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
>>> amdgpu_device *adev,
>>>           ih_regs = &ih->ih_regs;
>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>       }
>>> +
>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>> bit),
>>> +     * reset it here to detect more overflows if they occur.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>> +        ih->overflow = false;
>>> +    }
>>>   }
>>>
>>>   /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>> index ddfc6941f9d5..bb725a970697 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
>>> amdgpu_device *adev,
>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>> +    ih->overflow = true;
>>>
>>>   out:
>>>       return (wptr & ih->ptr_mask);
>>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
>>> amdgpu_device *adev,
>>>   static void vega20_ih_set_rptr(struct amdgpu_device *adev,
>>>                      struct amdgpu_ih_ring *ih)
>>>   {
>>> +    u32 tmp;
>>>       struct amdgpu_ih_regs *ih_regs;
>>>
>>>       if (ih == &adev->irq.ih_soft)
>>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
>>> amdgpu_device *adev,
>>>           ih_regs = &ih->ih_regs;
>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>       }
>>> +
>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>> bit),
>>> +     * reset it here to detect more overflows if they occur.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>> +        ih->overflow = false;
>>> +    }
>>>   }
>>>
>>>   /**
>>> -- 
>>> 2.43.0
>>>
>>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Process fences on IH overflow
  2024-01-15 11:19       ` Friedrich Vock
@ 2024-01-16  7:17         ` Christian König
  -1 siblings, 0 replies; 28+ messages in thread
From: Christian König @ 2024-01-16  7:17 UTC (permalink / raw)
  To: Friedrich Vock, amd-gfx; +Cc: Alex Deucher, stable, Joshua Ashton

Am 15.01.24 um 12:19 schrieb Friedrich Vock:
> On 15.01.24 11:26, Christian König wrote:
>> Am 14.01.24 um 14:00 schrieb Friedrich Vock:
>>> If the IH ring buffer overflows, it's possible that fence signal events
>>> were lost. Check each ring for progress to prevent job timeouts/GPU
>>> hangs due to the fences staying unsignaled despite the work being done.
>>
>> That's completely unnecessary and in some cases even harmful.
> How is it harmful? The only effect it can have is prevent unnecessary
> GPU hangs, no? It's not like it hides any legitimate errors that you'd
> otherwise see.

We have no guarantee that all ring buffers are actually fully 
initialized to allow fence processing.

Apart from that, fence processing is the least of your problems when an
IV overflow occurs. Other interrupt sources which are not repeated are
usually far worse.

>>
>> We already have a timeout handler for that and overflows point to
>> severe system problem so they should never occur in a production system.
>
> IH ring buffer overflows are pretty reliably reproducible if you trigger
> a lot of page faults, at least on Deck. Why shouldn't enough page faults
> in quick succession be able to overflow the IH ring buffer?

At least not on recent hw generations. Since gfx9 we have a rate limit 
on the number of page faults generated.

What we could maybe do as well is to change the default of vm_fault_stop,
but for your case that would be even worse in production.

>
> The fence fallback timer as it is now is useless for this because it
> only gets triggered once after 0.5s. I guess an alternative approach
> would be to make a timer trigger for each work item in flight every
> 0.5s, but why should that be better than just handling overflow errors
> as they occur?

That is intentional. As I said an IH overflow just points out that there 
is something massively wrong in the HW programming.

After gfx9 the IH should never produce an overflow any more, otherwise
either the ratelimit doesn't work or isn't enabled for some reason or
the IH ring buffer is just too small.

Regards,
Christian.

>
> Regards,
> Friedrich
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Cc: Joshua Ashton <joshua@froggi.es>
>>> Cc: Alex Deucher <alexander.deucher@amd.com>
>>> Cc: stable@vger.kernel.org
>>>
>>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++
>>>   1 file changed, 15 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>> index f3b0aaf3ebc6..2a246db1d3a7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>> @@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev,
>>> struct amdgpu_ih_ring *ih)
>>>   {
>>>       unsigned int count;
>>>       u32 wptr;
>>> +    int i;
>>>
>>>       if (!ih->enabled || adev->shutdown)
>>>           return IRQ_NONE;
>>> @@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device
>>> *adev, struct amdgpu_ih_ring *ih)
>>>           ih->rptr &= ih->ptr_mask;
>>>       }
>>>
>>> +    /* If the ring buffer overflowed, we might have lost some fence
>>> +     * signal interrupts. Check if there was any activity so the 
>>> signal
>>> +     * doesn't get lost.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +            struct amdgpu_ring *ring = adev->rings[i];
>>> +
>>> +            if (!ring || !ring->fence_drv.initialized)
>>> +                continue;
>>> +            amdgpu_fence_process(ring);
>>> +        }
>>> +    }
>>> +
>>>       amdgpu_ih_set_rptr(adev, ih);
>>>       wake_up_all(&ih->wait_process);
>>>
>>> -- 
>>> 2.43.0
>>>
>>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Process fences on IH overflow
@ 2024-01-16  7:17         ` Christian König
  0 siblings, 0 replies; 28+ messages in thread
From: Christian König @ 2024-01-16  7:17 UTC (permalink / raw)
  To: Friedrich Vock, amd-gfx; +Cc: Alex Deucher, Joshua Ashton, stable

Am 15.01.24 um 12:19 schrieb Friedrich Vock:
> On 15.01.24 11:26, Christian König wrote:
>> Am 14.01.24 um 14:00 schrieb Friedrich Vock:
>>> If the IH ring buffer overflows, it's possible that fence signal events
>>> were lost. Check each ring for progress to prevent job timeouts/GPU
>>> hangs due to the fences staying unsignaled despite the work being done.
>>
>> That's completely unnecessary and in some cases even harmful.
> How is it harmful? The only effect it can have is prevent unnecessary
> GPU hangs, no? It's not like it hides any legitimate errors that you'd
> otherwise see.

We have no guarantee that all ring buffers are actually fully 
initialized to allow fence processing.

Apart from that, fence processing is the least of your problems when an
IV overflow occurs. Other interrupt sources which are not repeated are
usually far worse.

>>
>> We already have a timeout handler for that and overflows point to
>> severe system problem so they should never occur in a production system.
>
> IH ring buffer overflows are pretty reliably reproducible if you trigger
> a lot of page faults, at least on Deck. Why shouldn't enough page faults
> in quick succession be able to overflow the IH ring buffer?

At least not on recent hw generations. Since gfx9 we have a rate limit 
on the number of page faults generated.

What we could maybe do as well is to change the default of vm_fault_stop,
but for your case that would be even worse in production.

>
> The fence fallback timer as it is now is useless for this because it
> only gets triggered once after 0.5s. I guess an alternative approach
> would be to make a timer trigger for each work item in flight every
> 0.5s, but why should that be better than just handling overflow errors
> as they occur?

That is intentional. As I said an IH overflow just points out that there 
is something massively wrong in the HW programming.

After gfx9 the IH should never produce an overflow any more, otherwise
either the ratelimit doesn't work or isn't enabled for some reason or
the IH ring buffer is just too small.

Regards,
Christian.

>
> Regards,
> Friedrich
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Cc: Joshua Ashton <joshua@froggi.es>
>>> Cc: Alex Deucher <alexander.deucher@amd.com>
>>> Cc: stable@vger.kernel.org
>>>
>>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++
>>>   1 file changed, 15 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>> index f3b0aaf3ebc6..2a246db1d3a7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>> @@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev,
>>> struct amdgpu_ih_ring *ih)
>>>   {
>>>       unsigned int count;
>>>       u32 wptr;
>>> +    int i;
>>>
>>>       if (!ih->enabled || adev->shutdown)
>>>           return IRQ_NONE;
>>> @@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device
>>> *adev, struct amdgpu_ih_ring *ih)
>>>           ih->rptr &= ih->ptr_mask;
>>>       }
>>>
>>> +    /* If the ring buffer overflowed, we might have lost some fence
>>> +     * signal interrupts. Check if there was any activity so the 
>>> signal
>>> +     * doesn't get lost.
>>> +     */
>>> +    if (ih->overflow) {
>>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +            struct amdgpu_ring *ring = adev->rings[i];
>>> +
>>> +            if (!ring || !ring->fence_drv.initialized)
>>> +                continue;
>>> +            amdgpu_fence_process(ring);
>>> +        }
>>> +    }
>>> +
>>>       amdgpu_ih_set_rptr(adev, ih);
>>>       wake_up_all(&ih->wait_process);
>>>
>>> -- 
>>> 2.43.0
>>>
>>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-16  7:03     ` Christian König
@ 2024-01-16 10:31       ` Friedrich Vock
  2024-01-17 12:27         ` Christian König
  0 siblings, 1 reply; 28+ messages in thread
From: Friedrich Vock @ 2024-01-16 10:31 UTC (permalink / raw)
  To: Christian König; +Cc: Alex Deucher, amd-gfx, Joshua Ashton

On 16.01.24 08:03, Christian König wrote:
> Am 15.01.24 um 12:18 schrieb Friedrich Vock:
>> Adding the original Ccs from the thread since they seemed to be missing
>> in the reply.
>>
>> On 15.01.24 11:55, Christian König wrote:
>>> Am 14.01.24 um 14:00 schrieb Friedrich Vock:
>>>> Allows us to detect subsequent IH ring buffer overflows as well.
>>>
>>> Well that suggested handling here is certainly broken, see below.
>>>
>>>>
>>>> Cc: Joshua Ashton <joshua@froggi.es>
>>>> Cc: Alex Deucher <alexander.deucher@amd.com>
>>>> Cc: stable@vger.kernel.org
>>>>
>>>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  |  2 ++
>>>>   drivers/gpu/drm/amd/amdgpu/cik_ih.c     | 13 +++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/cz_ih.c      | 14 +++++++++++++-
>>>>   drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 14 +++++++++++++-
>>>>   drivers/gpu/drm/amd/amdgpu/ih_v6_0.c    | 13 +++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/ih_v6_1.c    | 13 +++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c  | 12 ++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/si_ih.c      | 12 ++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/tonga_ih.c   | 13 +++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c  | 12 ++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/vega20_ih.c  | 12 ++++++++++++
>>>>   11 files changed, 128 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>> index 508f02eb0cf8..6041ec727f06 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>> @@ -69,6 +69,8 @@ struct amdgpu_ih_ring {
>>>>       unsigned        rptr;
>>>>       struct amdgpu_ih_regs    ih_regs;
>>>>
>>>> +    bool overflow;
>>>> +
>>>>       /* For waiting on IH processing at checkpoint. */
>>>>       wait_queue_head_t wait_process;
>>>>       uint64_t        processed_timestamp;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>>> b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>>> index 6f7c031dd197..807cc30c9e33 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>>> @@ -204,6 +204,7 @@ static u32 cik_ih_get_wptr(struct amdgpu_device
>>>> *adev,
>>>>           tmp = RREG32(mmIH_RB_CNTL);
>>>>           tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>           WREG32(mmIH_RB_CNTL, tmp);
>>>> +        ih->overflow = true;
>>>>       }
>>>>       return (wptr & ih->ptr_mask);
>>>>   }
>>>> @@ -274,7 +275,19 @@ static void cik_ih_decode_iv(struct
>>>> amdgpu_device *adev,
>>>>   static void cik_ih_set_rptr(struct amdgpu_device *adev,
>>>>                   struct amdgpu_ih_ring *ih)
>>>>   {
>>>> +    u32 tmp;
>>>> +
>>>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>>>> +
>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>> bit),
>>>> +     * reset it here to detect more overflows if they occur.
>>>> +     */
>>>> +    if (ih->overflow) {
>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>> +        ih->overflow = false;
>>>> +    }
>>>
>>> Well that is an extremely bad idea. We already reset the overflow
>>> after reading the WPTR.
>>
>> This is not resetting the overflow bit. This is resetting a "clear
>> overflow" bit. I don't have the hardware docs, but the name (and my
>> observations) strongly suggest that setting this bit actually prevents
>> the hardware from setting the overflow bit ever again.
>
> Well that doesn't make any sense at all. The hardware documentation
> clearly states that this bit is write only and should always read as
> zero.
>
> Setting this bit will clear the overflow flag in the WPTR register and
> clearing it has no effect at all.
>
> I could only ping the hw engineer responsible for this block to double
> check if the documentation is somehow outdated, but I really doubt so.
>
I see. I wish I had access to the documentation, but I don't, so all I
can do is tell you what I observe the hardware doing. I've tested this
on both a Steam Deck (OSSYS 5.2.0) and an RX 6700 XT (OSSYS 5.0.3). On
both systems, launching a bunch of shaders that cause page faults leads
to lots of "[gfxhub] page fault" messages in dmesg, followed by an
"amdgpu: IH ring buffer overflow".

If I re-launch the same set of shaders after the GPU has soft-recovered,
the "amdgpu: IH ring buffer overflow" message is missing, even though
the same amount of page faults should've been triggered at roughly the
same rate. Running with this patch applied makes more "amdgpu: IH ring
buffer overflow" messages appear after relaunching the faulting shaders
(but not when processing any non-faulting work).

The only possible conclusion I can draw from this is that clearing that
bit *does* have an effect, and I don't think it's far-fetched to assume
the IH ring buffer overflows still happen after re-launching the
faulting shaders but go undetected so far.

>> Right now, IH overflows, even if they occur repeatedly, only get
>> registered once. If not registering IH overflows can trivially lead to
>> system crashes, it's amdgpu's current handling that is broken.
>
> It's years that we last tested this but according to the HW
> documentation this should work fine.
>
> What could potentially happen is that the IH has silenced the source
> of the overflow. We never implemented resetting those, but in this
> case that here won't help either.
>
If the IH silenced the page faults (which quite clearly cause the
overflow here), then how are the page faults still logged in dmesg?
>>
>> The possibility of a repeated IH overflow in between reading the wptr
>> and updating the rptr is a good point, but how can we detect that at
>> all? It seems to me like we can't set the OVERFLOW_CLEAR bit at all
>> then, because we're guaranteed to miss any overflows that happen while
>> the bit is set.
>
> When an IH overflow is signaled we clear that flag by writing 1 into
> the OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.
>
> What can of course happen is that the IH ring buffer overflows more
> than this single entry and we process IVs which are potentially
> corrupted, but we won't miss any additional overflows since we only
> start processing after resetting the flag.
>
> An IH overflow is also something you should *never* see in a
> production system. This is purely for driver bringup and as fallback
> when there is a severe incorrect programming of the HW.
>
> The only exception of that is page fault handling on MI products
> because of a hardware bug, to mitigate this we are processing page
> faults on a separate IH ring on those parts.
>
> On all other hw generations the IH should have some rate limit for the
> number of faults generated per second, so that the CPU is always able
> to catch up.

I'm wondering if there is another bug in here somewhere. Your
explanation of how it's supposed to work makes a lot of sense, but from
what I can tell it doesn't work that way when I test it.

From the printk_ratelimit stats it would seem like >2000 faults arrive
in less than a second, so perhaps your theory about fault interrupt
ratelimiting not working is correct (but it's hard for me to verify what
is going on without the documentation).

Regards,
Friedrich

>
> Regards,
> Christian.
>
>>
>> Regards,
>> Friedrich
>>
>>>
>>> When you clear the overflow again when updating the RPTR you could
>>> loose another overflow which might have happened in between and so
>>> potentially process corrupted IVs.
>>>
>>> That can trivially crash the system.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>   }
>>>>
>>>>   static int cik_ih_early_init(void *handle)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>> index b8c47e0cf37a..076559668573 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct amdgpu_device
>>>> *adev,
>>>>       tmp = RREG32(mmIH_RB_CNTL);
>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>       WREG32(mmIH_RB_CNTL, tmp);
>>>> -
>>>> +    ih->overflow = true;
>>>>
>>>>   out:
>>>>       return (wptr & ih->ptr_mask);
>>>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct amdgpu_device
>>>> *adev,
>>>>   static void cz_ih_set_rptr(struct amdgpu_device *adev,
>>>>                  struct amdgpu_ih_ring *ih)
>>>>   {
>>>> +    u32 tmp;
>>>> +
>>>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>>>> +
>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>> bit),
>>>> +     * reset it here to detect more overflows if they occur.
>>>> +     */
>>>> +    if (ih->overflow) {
>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>> +        ih->overflow = false;
>>>> +    }
>>>>   }
>>>>
>>>>   static int cz_ih_early_init(void *handle)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>> index aecad530b10a..1a5e668643d1 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
>>>> amdgpu_device *adev,
>>>>       tmp = RREG32(mmIH_RB_CNTL);
>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>       WREG32(mmIH_RB_CNTL, tmp);
>>>> -
>>>> +    ih->overflow = true;
>>>>
>>>>   out:
>>>>       return (wptr & ih->ptr_mask);
>>>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
>>>> amdgpu_device *adev,
>>>>   static void iceland_ih_set_rptr(struct amdgpu_device *adev,
>>>>                   struct amdgpu_ih_ring *ih)
>>>>   {
>>>> +    u32 tmp;
>>>> +
>>>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>>>> +
>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>> bit),
>>>> +     * reset it here to detect more overflows if they occur.
>>>> +     */
>>>> +    if (ih->overflow) {
>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>> +        ih->overflow = false;
>>>> +    }
>>>>   }
>>>>
>>>>   static int iceland_ih_early_init(void *handle)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>> index d9ed7332d805..ce8f7feec713 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct amdgpu_device
>>>> *adev,
>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>> +    ih->overflow = true;
>>>> +
>>>>   out:
>>>>       return (wptr & ih->ptr_mask);
>>>>   }
>>>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
>>>> amdgpu_device *adev,
>>>>   static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
>>>>                      struct amdgpu_ih_ring *ih)
>>>>   {
>>>> +    u32 tmp;
>>>>       struct amdgpu_ih_regs *ih_regs;
>>>>
>>>>       if (ih->use_doorbell) {
>>>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
>>>> amdgpu_device *adev,
>>>>           ih_regs = &ih->ih_regs;
>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>       }
>>>> +
>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>> bit),
>>>> +     * reset it here to detect more overflows if they occur.
>>>> +     */
>>>> +    if (ih->overflow) {
>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>> +        ih->overflow = false;
>>>> +    }
>>>>   }
>>>>
>>>>   /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>> index 8fb05eae340a..668788ad34d9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct amdgpu_device
>>>> *adev,
>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>> +    ih->overflow = true;
>>>> +
>>>>   out:
>>>>       return (wptr & ih->ptr_mask);
>>>>   }
>>>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
>>>> amdgpu_device *adev,
>>>>   static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
>>>>                      struct amdgpu_ih_ring *ih)
>>>>   {
>>>> +    u32 tmp;
>>>>       struct amdgpu_ih_regs *ih_regs;
>>>>
>>>>       if (ih->use_doorbell) {
>>>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
>>>> amdgpu_device *adev,
>>>>           ih_regs = &ih->ih_regs;
>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>       }
>>>> +
>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>> bit),
>>>> +     * reset it here to detect more overflows if they occur.
>>>> +     */
>>>> +    if (ih->overflow) {
>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>> +        ih->overflow = false;
>>>> +    }
>>>>   }
>>>>
>>>>   /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>> index e64b33115848..0bdac923cb4d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
>>>> amdgpu_device *adev,
>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>> +    ih->overflow = true;
>>>>   out:
>>>>       return (wptr & ih->ptr_mask);
>>>>   }
>>>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
>>>> amdgpu_device *adev,
>>>>   static void navi10_ih_set_rptr(struct amdgpu_device *adev,
>>>>                      struct amdgpu_ih_ring *ih)
>>>>   {
>>>> +    u32 tmp;
>>>>       struct amdgpu_ih_regs *ih_regs;
>>>>
>>>>       if (ih == &adev->irq.ih_soft)
>>>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
>>>> amdgpu_device *adev,
>>>>           ih_regs = &ih->ih_regs;
>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>       }
>>>> +
>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>> bit),
>>>> +     * reset it here to detect more overflows if they occur.
>>>> +     */
>>>> +    if (ih->overflow) {
>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>> +        ih->overflow = false;
>>>> +    }
>>>>   }
>>>>
>>>>   /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>> index 9a24f17a5750..ff35056d2b54 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct amdgpu_device
>>>> *adev,
>>>>           tmp = RREG32(IH_RB_CNTL);
>>>>           tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>           WREG32(IH_RB_CNTL, tmp);
>>>> +        ih->overflow = true;
>>>>       }
>>>>       return (wptr & ih->ptr_mask);
>>>>   }
>>>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct amdgpu_device
>>>> *adev,
>>>>   static void si_ih_set_rptr(struct amdgpu_device *adev,
>>>>                  struct amdgpu_ih_ring *ih)
>>>>   {
>>>> +    u32 tmp;
>>>> +
>>>>       WREG32(IH_RB_RPTR, ih->rptr);
>>>> +
>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>> bit),
>>>> +     * reset it here to detect more overflows if they occur.
>>>> +     */
>>>> +    if (ih->overflow) {
>>>> +        tmp = RREG32(IH_RB_CNTL);
>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>> +        WREG32(IH_RB_CNTL, tmp);
>>>> +    }
>>>>   }
>>>>
>>>>   static int si_ih_early_init(void *handle)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>> index 917707bba7f3..6f5090d3db48 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device
>>>> *adev,
>>>>       tmp = RREG32(mmIH_RB_CNTL);
>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>       WREG32(mmIH_RB_CNTL, tmp);
>>>> +    ih->overflow = true;
>>>>
>>>>   out:
>>>>       return (wptr & ih->ptr_mask);
>>>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
>>>> amdgpu_device *adev,
>>>>   static void tonga_ih_set_rptr(struct amdgpu_device *adev,
>>>>                     struct amdgpu_ih_ring *ih)
>>>>   {
>>>> +    u32 tmp;
>>>> +
>>>>       if (ih->use_doorbell) {
>>>>           /* XXX check if swapping is necessary on BE */
>>>>           *ih->rptr_cpu = ih->rptr;
>>>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
>>>> amdgpu_device *adev,
>>>>       } else {
>>>>           WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>       }
>>>> +
>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>> bit),
>>>> +     * reset it here to detect more overflows if they occur.
>>>> +     */
>>>> +    if (ih->overflow) {
>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>> +        ih->overflow = false;
>>>> +    }
>>>>   }
>>>>
>>>>   static int tonga_ih_early_init(void *handle)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>> index d364c6dd152c..bb005924f194 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
>>>> amdgpu_device *adev,
>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>> +    ih->overflow = true;
>>>>
>>>>   out:
>>>>       return (wptr & ih->ptr_mask);
>>>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
>>>> amdgpu_device *adev,
>>>>   static void vega10_ih_set_rptr(struct amdgpu_device *adev,
>>>>                      struct amdgpu_ih_ring *ih)
>>>>   {
>>>> +    u32 tmp;
>>>>       struct amdgpu_ih_regs *ih_regs;
>>>>
>>>>       if (ih == &adev->irq.ih_soft)
>>>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
>>>> amdgpu_device *adev,
>>>>           ih_regs = &ih->ih_regs;
>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>       }
>>>> +
>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>> bit),
>>>> +     * reset it here to detect more overflows if they occur.
>>>> +     */
>>>> +    if (ih->overflow) {
>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>> +        ih->overflow = false;
>>>> +    }
>>>>   }
>>>>
>>>>   /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>> index ddfc6941f9d5..bb725a970697 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
>>>> amdgpu_device *adev,
>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>> +    ih->overflow = true;
>>>>
>>>>   out:
>>>>       return (wptr & ih->ptr_mask);
>>>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
>>>> amdgpu_device *adev,
>>>>   static void vega20_ih_set_rptr(struct amdgpu_device *adev,
>>>>                      struct amdgpu_ih_ring *ih)
>>>>   {
>>>> +    u32 tmp;
>>>>       struct amdgpu_ih_regs *ih_regs;
>>>>
>>>>       if (ih == &adev->irq.ih_soft)
>>>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
>>>> amdgpu_device *adev,
>>>>           ih_regs = &ih->ih_regs;
>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>       }
>>>> +
>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>> bit),
>>>> +     * reset it here to detect more overflows if they occur.
>>>> +     */
>>>> +    if (ih->overflow) {
>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0);
>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>> +        ih->overflow = false;
>>>> +    }
>>>>   }
>>>>
>>>>   /**
>>>> --
>>>> 2.43.0
>>>>
>>>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-16 10:31       ` Friedrich Vock
@ 2024-01-17 12:27         ` Christian König
  2024-01-17 23:00           ` Alex Deucher
  0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2024-01-17 12:27 UTC (permalink / raw)
  To: Friedrich Vock; +Cc: Alex Deucher, amd-gfx, Joshua Ashton

Am 16.01.24 um 11:31 schrieb Friedrich Vock:
> On 16.01.24 08:03, Christian König wrote:
>> Am 15.01.24 um 12:18 schrieb Friedrich Vock:
>>> [SNIP]
>>>>> +    if (ih->overflow) {
>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>> +        ih->overflow = false;
>>>>> +    }
>>>>
>>>> Well that is an extremely bad idea. We already reset the overflow
>>>> after reading the WPTR.
>>>
>>> This is not resetting the overflow bit. This is resetting a "clear
>>> overflow" bit. I don't have the hardware docs, but the name (and my
>>> observations) strongly suggest that setting this bit actually prevents
>>> the hardware from setting the overflow bit ever again.
>>
>> Well that doesn't make any sense at all. The hardware documentation
>> clearly states that this bit is write only and should always read as
>> zero.
>>
>> Setting this bit will clear the overflow flag in the WPTR register and
>> clearing it has no effect at all.
>>
>> I could only ping the hw engineer responsible for this block to double
>> check if the documentation is somehow outdated, but I really doubt so.
>>
> I see. I wish I had access to the documentation,

Well, doesn't Valve have an NDA in place?

> but I don't, so all I
> can do is tell you what I observe the hardware doing. I've tested this
> on both a Steam Deck (OSSYS 5.2.0) and an RX 6700 XT (OSSYS 5.0.3). On
> both systems, launching a bunch of shaders that cause page faults leads
> to lots of "[gfxhub] page fault" messages in dmesg, followed by an
> "amdgpu: IH ring buffer overflow".

Well that is certainly a bug, maybe even the same thing we have seen on 
Vega and MI.

What we could do is to try to apply the same workaround to re-route the 
page faults to a different IH ring.

See those patches here as well:

commit 516bc3d8dd7965f1a8a3ea453857f14d95971e62
Author: Christian König <christian.koenig@amd.com>
Date:   Fri Nov 2 15:00:16 2018 +0100

     drm/amdgpu: reroute VMC and UMD to IH ring 1

     Page faults can easily overwhelm the interrupt handler.

     So to make sure that we never lose valuable interrupts on the
     primary ring we re-route page faults to IH ring 1.

commit b849aaa41c914a0fd88003f88cb04420a873c624
Author: Christian König <christian.koenig@amd.com>
Date:   Mon Mar 4 19:34:34 2019 +0100

     drm/amdgpu: also reroute VMC and UMD to IH ring 1 on Vega 20

     Same patch we already did for Vega10. Just re-route page faults to a
     separate ring to avoid drowning in interrupts.

>
> If I re-launch the same set of shaders after the GPU has soft-recovered,
> the "amdgpu: IH ring buffer overflow" message is missing, even though
> the same amount of page faults should've been triggered at roughly the
> same rate. Running with this patch applied makes more "amdgpu: IH ring
> buffer overflow" messages appear after relaunching the faulting shaders
> (but not when processing any non-faulting work).

That is actually the expected behavior. There should be a limit on the 
number of faults written to the ring so that the ring never overflows.

>
> The only possible conclusion I can draw from this is that clearing that
> bit *does* have an effect, and I don't think it's far-fetched to assume
> the IH ring buffer overflows still happen after re-launching the
> faulting shaders but go undetected so far.

Well that can only mean that the hw documentation is incorrect.

Either the bit is not a write-only trigger bit as documented, or we need
an additional read of the register for it to take effect, or something
like that.

>>> Right now, IH overflows, even if they occur repeatedly, only get
>>> registered once. If not registering IH overflows can trivially lead to
>>> system crashes, it's amdgpu's current handling that is broken.
>>
>> It's years that we last tested this but according to the HW
>> documentation this should work fine.
>>
>> What could potentially happen is that the IH has silenced the source
>> of the overflow. We never implemented resetting those, but in this
>> case that here won't help either.
>>
> If the IH silenced the page faults (which quite clearly cause the
> overflow here), then how are the page faults still logged in dmesg?

There should be a hardware rate limit for the page faults, e.g. there 
can only be X faults reported in N clock cycles and then a delay is 
inserted.

>>>
>>> The possibility of a repeated IH overflow in between reading the wptr
>>> and updating the rptr is a good point, but how can we detect that at
>>> all? It seems to me like we can't set the OVERFLOW_CLEAR bit at all
>>> then, because we're guaranteed to miss any overflows that happen while
>>> the bit is set.
>>
>> When an IH overflow is signaled we clear that flag by writing 1 into
>> the OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.
>>
>> What can of course happen is that the IH ring buffer overflows more
>> than this single entry and we process IVs which are potentially
>> corrupted, but we won't miss any additional overflows since we only
>> start processing after resetting the flag.
>>
>> An IH overflow is also something you should *never* see in a
>> production system. This is purely for driver bringup and as fallback
>> when there is a severe incorrect programming of the HW.
>>
>> The only exception of that is page fault handling on MI products
>> because of a hardware bug, to mitigate this we are processing page
>> faults on a separate IH ring on those parts.
>>
>> On all other hw generations the IH should have some rate limit for the
>> number of faults generated per second, so that the CPU is always able
>> to catch up.
>
> I'm wondering if there is another bug in here somewhere. Your
> explanation of how it's supposed to work makes a lot of sense, but from
> what I can tell it doesn't work that way when I test it.
>
> From the printk_ratelimit stats it would seem like >2000 faults arrive
> in less than a second, so perhaps your theory about fault interrupt
> ratelimiting not working is correct (but it's hard for me to verify what
> is going on without the documentation).

I'm going to ping the relevant engineer and put someone on the task
to take a look.

Thanks,
Christian.

>
> Regards,
> Friedrich
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>> Friedrich
>>>
>>>>
>>>> When you clear the overflow again when updating the RPTR you could
>>>> loose another overflow which might have happened in between and so
>>>> potentially process corrupted IVs.
>>>>
>>>> That can trivially crash the system.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>   }
>>>>>
>>>>>   static int cik_ih_early_init(void *handle)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>> index b8c47e0cf37a..076559668573 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct amdgpu_device
>>>>> *adev,
>>>>>       tmp = RREG32(mmIH_RB_CNTL);
>>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>       WREG32(mmIH_RB_CNTL, tmp);
>>>>> -
>>>>> +    ih->overflow = true;
>>>>>
>>>>>   out:
>>>>>       return (wptr & ih->ptr_mask);
>>>>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct amdgpu_device
>>>>> *adev,
>>>>>   static void cz_ih_set_rptr(struct amdgpu_device *adev,
>>>>>                  struct amdgpu_ih_ring *ih)
>>>>>   {
>>>>> +    u32 tmp;
>>>>> +
>>>>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>> +
>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>> bit),
>>>>> +     * reset it here to detect more overflows if they occur.
>>>>> +     */
>>>>> +    if (ih->overflow) {
>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 
>>>>> 0);
>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>> +        ih->overflow = false;
>>>>> +    }
>>>>>   }
>>>>>
>>>>>   static int cz_ih_early_init(void *handle)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>> index aecad530b10a..1a5e668643d1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
>>>>> amdgpu_device *adev,
>>>>>       tmp = RREG32(mmIH_RB_CNTL);
>>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>       WREG32(mmIH_RB_CNTL, tmp);
>>>>> -
>>>>> +    ih->overflow = true;
>>>>>
>>>>>   out:
>>>>>       return (wptr & ih->ptr_mask);
>>>>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
>>>>> amdgpu_device *adev,
>>>>>   static void iceland_ih_set_rptr(struct amdgpu_device *adev,
>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>   {
>>>>> +    u32 tmp;
>>>>> +
>>>>>       WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>> +
>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>> bit),
>>>>> +     * reset it here to detect more overflows if they occur.
>>>>> +     */
>>>>> +    if (ih->overflow) {
>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 
>>>>> 0);
>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>> +        ih->overflow = false;
>>>>> +    }
>>>>>   }
>>>>>
>>>>>   static int iceland_ih_early_init(void *handle)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>> index d9ed7332d805..ce8f7feec713 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct amdgpu_device
>>>>> *adev,
>>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>> +    ih->overflow = true;
>>>>> +
>>>>>   out:
>>>>>       return (wptr & ih->ptr_mask);
>>>>>   }
>>>>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
>>>>> amdgpu_device *adev,
>>>>>   static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>   {
>>>>> +    u32 tmp;
>>>>>       struct amdgpu_ih_regs *ih_regs;
>>>>>
>>>>>       if (ih->use_doorbell) {
>>>>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
>>>>> amdgpu_device *adev,
>>>>>           ih_regs = &ih->ih_regs;
>>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>       }
>>>>> +
>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>> bit),
>>>>> +     * reset it here to detect more overflows if they occur.
>>>>> +     */
>>>>> +    if (ih->overflow) {
>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 
>>>>> 0);
>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>> +        ih->overflow = false;
>>>>> +    }
>>>>>   }
>>>>>
>>>>>   /**
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>> index 8fb05eae340a..668788ad34d9 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct amdgpu_device
>>>>> *adev,
>>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>> +    ih->overflow = true;
>>>>> +
>>>>>   out:
>>>>>       return (wptr & ih->ptr_mask);
>>>>>   }
>>>>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
>>>>> amdgpu_device *adev,
>>>>>   static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>   {
>>>>> +    u32 tmp;
>>>>>       struct amdgpu_ih_regs *ih_regs;
>>>>>
>>>>>       if (ih->use_doorbell) {
>>>>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
>>>>> amdgpu_device *adev,
>>>>>           ih_regs = &ih->ih_regs;
>>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>       }
>>>>> +
>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>> bit),
>>>>> +     * reset it here to detect more overflows if they occur.
>>>>> +     */
>>>>> +    if (ih->overflow) {
>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 
>>>>> 0);
>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>> +        ih->overflow = false;
>>>>> +    }
>>>>>   }
>>>>>
>>>>>   /**
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>> index e64b33115848..0bdac923cb4d 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
>>>>> amdgpu_device *adev,
>>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>> +    ih->overflow = true;
>>>>>   out:
>>>>>       return (wptr & ih->ptr_mask);
>>>>>   }
>>>>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
>>>>> amdgpu_device *adev,
>>>>>   static void navi10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>   {
>>>>> +    u32 tmp;
>>>>>       struct amdgpu_ih_regs *ih_regs;
>>>>>
>>>>>       if (ih == &adev->irq.ih_soft)
>>>>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
>>>>> amdgpu_device *adev,
>>>>>           ih_regs = &ih->ih_regs;
>>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>       }
>>>>> +
>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>> bit),
>>>>> +     * reset it here to detect more overflows if they occur.
>>>>> +     */
>>>>> +    if (ih->overflow) {
>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 
>>>>> 0);
>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>> +        ih->overflow = false;
>>>>> +    }
>>>>>   }
>>>>>
>>>>>   /**
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>> index 9a24f17a5750..ff35056d2b54 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct amdgpu_device
>>>>> *adev,
>>>>>           tmp = RREG32(IH_RB_CNTL);
>>>>>           tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>           WREG32(IH_RB_CNTL, tmp);
>>>>> +        ih->overflow = true;
>>>>>       }
>>>>>       return (wptr & ih->ptr_mask);
>>>>>   }
>>>>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct amdgpu_device
>>>>> *adev,
>>>>>   static void si_ih_set_rptr(struct amdgpu_device *adev,
>>>>>                  struct amdgpu_ih_ring *ih)
>>>>>   {
>>>>> +    u32 tmp;
>>>>> +
>>>>>       WREG32(IH_RB_RPTR, ih->rptr);
>>>>> +
>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>> bit),
>>>>> +     * reset it here to detect more overflows if they occur.
>>>>> +     */
>>>>> +    if (ih->overflow) {
>>>>> +        tmp = RREG32(IH_RB_CNTL);
>>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>> +        WREG32(IH_RB_CNTL, tmp);
>>>>> +    }
>>>>>   }
>>>>>
>>>>>   static int si_ih_early_init(void *handle)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>> index 917707bba7f3..6f5090d3db48 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device
>>>>> *adev,
>>>>>       tmp = RREG32(mmIH_RB_CNTL);
>>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>       WREG32(mmIH_RB_CNTL, tmp);
>>>>> +    ih->overflow = true;
>>>>>
>>>>>   out:
>>>>>       return (wptr & ih->ptr_mask);
>>>>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
>>>>> amdgpu_device *adev,
>>>>>   static void tonga_ih_set_rptr(struct amdgpu_device *adev,
>>>>>                     struct amdgpu_ih_ring *ih)
>>>>>   {
>>>>> +    u32 tmp;
>>>>> +
>>>>>       if (ih->use_doorbell) {
>>>>>           /* XXX check if swapping is necessary on BE */
>>>>>           *ih->rptr_cpu = ih->rptr;
>>>>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
>>>>> amdgpu_device *adev,
>>>>>       } else {
>>>>>           WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>       }
>>>>> +
>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>> bit),
>>>>> +     * reset it here to detect more overflows if they occur.
>>>>> +     */
>>>>> +    if (ih->overflow) {
>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 
>>>>> 0);
>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>> +        ih->overflow = false;
>>>>> +    }
>>>>>   }
>>>>>
>>>>>   static int tonga_ih_early_init(void *handle)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>> index d364c6dd152c..bb005924f194 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
>>>>> amdgpu_device *adev,
>>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>> +    ih->overflow = true;
>>>>>
>>>>>   out:
>>>>>       return (wptr & ih->ptr_mask);
>>>>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
>>>>> amdgpu_device *adev,
>>>>>   static void vega10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>   {
>>>>> +    u32 tmp;
>>>>>       struct amdgpu_ih_regs *ih_regs;
>>>>>
>>>>>       if (ih == &adev->irq.ih_soft)
>>>>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
>>>>> amdgpu_device *adev,
>>>>>           ih_regs = &ih->ih_regs;
>>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>       }
>>>>> +
>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>> bit),
>>>>> +     * reset it here to detect more overflows if they occur.
>>>>> +     */
>>>>> +    if (ih->overflow) {
>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 
>>>>> 0);
>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>> +        ih->overflow = false;
>>>>> +    }
>>>>>   }
>>>>>
>>>>>   /**
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>> index ddfc6941f9d5..bb725a970697 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
>>>>> amdgpu_device *adev,
>>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>> +    ih->overflow = true;
>>>>>
>>>>>   out:
>>>>>       return (wptr & ih->ptr_mask);
>>>>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
>>>>> amdgpu_device *adev,
>>>>>   static void vega20_ih_set_rptr(struct amdgpu_device *adev,
>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>   {
>>>>> +    u32 tmp;
>>>>>       struct amdgpu_ih_regs *ih_regs;
>>>>>
>>>>>       if (ih == &adev->irq.ih_soft)
>>>>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
>>>>> amdgpu_device *adev,
>>>>>           ih_regs = &ih->ih_regs;
>>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>       }
>>>>> +
>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>> bit),
>>>>> +     * reset it here to detect more overflows if they occur.
>>>>> +     */
>>>>> +    if (ih->overflow) {
>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 
>>>>> 0);
>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>> +        ih->overflow = false;
>>>>> +    }
>>>>>   }
>>>>>
>>>>>   /**
>>>>> -- 
>>>>> 2.43.0
>>>>>
>>>>
>>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-17 12:27         ` Christian König
@ 2024-01-17 23:00           ` Alex Deucher
  2024-01-17 23:44             ` Friedrich Vock
  0 siblings, 1 reply; 28+ messages in thread
From: Alex Deucher @ 2024-01-17 23:00 UTC (permalink / raw)
  To: Christian König, Christian Koenig
  Cc: Alex Deucher, Friedrich Vock, Joshua Ashton, amd-gfx

On Wed, Jan 17, 2024 at 7:36 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 16.01.24 um 11:31 schrieb Friedrich Vock:
> > On 16.01.24 08:03, Christian König wrote:
> >> Am 15.01.24 um 12:18 schrieb Friedrich Vock:
> >>> [SNIP]
> >>>>> +    if (ih->overflow) {
> >>>>> +        tmp = RREG32(mmIH_RB_CNTL);
> >>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
> >>>>> +        WREG32(mmIH_RB_CNTL, tmp);
> >>>>> +        ih->overflow = false;
> >>>>> +    }
> >>>>
> >>>> Well that is an extremely bad idea. We already reset the overflow
> >>>> after reading the WPTR.
> >>>
> >>> This is not resetting the overflow bit. This is resetting a "clear
> >>> overflow" bit. I don't have the hardware docs, but the name (and my
> >>> observations) strongly suggest that setting this bit actually prevents
> >>> the hardware from setting the overflow bit ever again.
> >>
> >> Well that doesn't make any sense at all. The hardware documentation
> >> clearly states that this bit is write only and should always read as
> >> zero.
> >>
> >> Setting this bit will clear the overflow flag in the WPTR register and
> >> clearing it has no effect at all.
> >>
> >> I could only ping the hw engineer responsible for this block to double
> >> check if the documentation is somehow outdated, but I really doubt so.
> >>
> > I see. I wish I had access to the documentation,
>
> Well, doesn't Valve have an NDA in place?
>
> > but I don't, so all I
> > can do is tell you what I observe the hardware doing. I've tested this
> > on both a Steam Deck (OSSYS 5.2.0) and an RX 6700 XT (OSSYS 5.0.3). On
> > both systems, launching a bunch of shaders that cause page faults leads
> > to lots of "[gfxhub] page fault" messages in dmesg, followed by an
> > "amdgpu: IH ring buffer overflow".
>
> Well that is certainly a bug, maybe even the same thing we have seen on
> Vega and MI.
>
> What we could do is to try to apply the same workaround to re-route the
> page faults to a different IH ring.
>
> See those patches here as well:
>
> commit 516bc3d8dd7965f1a8a3ea453857f14d95971e62
> Author: Christian König <christian.koenig@amd.com>
> Date:   Fri Nov 2 15:00:16 2018 +0100
>
>      drm/amdgpu: reroute VMC and UMD to IH ring 1
>
>      Page faults can easily overwhelm the interrupt handler.
>
>      So to make sure that we never lose valuable interrupts on the
> primary ring
>      we re-route page faults to IH ring 1.
>
> commit b849aaa41c914a0fd88003f88cb04420a873c624
> Author: Christian König <christian.koenig@amd.com>
> Date:   Mon Mar 4 19:34:34 2019 +0100
>
>      drm/amdgpu: also reroute VMC and UMD to IH ring 1 on Vega 20
>
>      Same patch we already did for Vega10. Just re-route page faults to a
> separate
>      ring to avoid drowning in interrupts.
>
> >
> > If I re-launch the same set of shaders after the GPU has soft-recovered,
> > the "amdgpu: IH ring buffer overflow" message is missing, even though
> > the same amount of page faults should've been triggered at roughly the
> > same rate. Running with this patch applied makes more "amdgpu: IH ring
> > buffer overflow" messages appear after relaunching the faulting shaders
> > (but not when processing any non-faulting work).
>
> That is actually the expected behavior. There should be a limit on the
> number of faults written to the ring so that the ring never overflows.
>
> >
> > The only possible conclusion I can draw from this is that clearing that
> > bit *does* have an effect, and I don't think it's far-fetched to assume
> > the IH ring buffer overflows still happen after re-launching the
> > faulting shaders but go undetected so far.
>
> Well that can only mean that the hw documentation is incorrect.
>
> Either the bit is not a write-only trigger bit as documented, or we need
> an additional read of the register for it to take effect, or something
> like that.
>
> >>> Right now, IH overflows, even if they occur repeatedly, only get
> >>> registered once. If not registering IH overflows can trivially lead to
> >>> system crashes, it's amdgpu's current handling that is broken.
> >>
> >> It's been years since we last tested this, but according to the HW
> >> documentation this should work fine.
> >>
> >> What could potentially happen is that the IH has silenced the source
> >> of the overflow. We never implemented resetting those, but in this
> >> case that here won't help either.
> >>
> > If the IH silenced the page faults (which quite clearly cause the
> > overflow here), then how are the page faults still logged in dmesg?
>
> There should be a hardware rate limit for the page faults, e.g. there
> can only be X faults reported in N clock cycles and then a delay is
> inserted.

@Christian Koenig  Is that tied to xnack (i.e., noretry)?  The default
is noretry=1 on gfx10.3 and newer.  But it can be overridden.  It was
not set on some older kernels, maybe that is the problem?  @Friedrich
Vock does setting amdgpu.noretry=1 fix the issue?

Alex

>
> >>>
> >>> The possibility of a repeated IH overflow in between reading the wptr
> >>> and updating the rptr is a good point, but how can we detect that at
> >>> all? It seems to me like we can't set the OVERFLOW_CLEAR bit at all
> >>> then, because we're guaranteed to miss any overflows that happen while
> >>> the bit is set.
> >>
> >> When an IH overflow is signaled we clear that flag by writing 1 into
> >> the OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.
> >>
> >> What can of course happen is that the IH ring buffer overflows more
> >> than this single entry and we process IVs which are potentially
> >> corrupted, but we won't miss any additional overflows since we only
> >> start processing after resetting the flag.
> >>
> >> An IH overflow is also something you should *never* see in a
> >> production system. This is purely for driver bringup and as fallback
> >> when there is a severe incorrect programming of the HW.
> >>
> >> The only exception of that is page fault handling on MI products
> >> because of a hardware bug, to mitigate this we are processing page
> >> faults on a separate IH ring on those parts.
> >>
> >> On all other hw generations the IH should have some rate limit for the
> >> number of faults generated per second, so that the CPU is always able
> >> to catch up.
> >
> > I'm wondering if there is another bug in here somewhere. Your
> > explanation of how it's supposed to work makes a lot of sense, but from
> > what I can tell it doesn't work that way when I test it.
> >
> > From the printk_ratelimit stats it would seem like >2000 faults arrive
> > in less than a second, so perhaps your theory about fault interrupt
> > ratelimiting not working is correct (but it's hard for me to verify what
> > is going on without the documentation).
>
> I'm going to ping the relevant engineer and put someone on the task
> to take a look.
>
> Thanks,
> Christian.
>
> >
> > Regards,
> > Friedrich
> >
> >>
> >> Regards,
> >> Christian.
> >>
> >>>
> >>> Regards,
> >>> Friedrich
> >>>
> >>>>
> >>>> When you clear the overflow again when updating the RPTR you could
> >>>> lose another overflow which might have happened in between and so
> >>>> potentially process corrupted IVs.
> >>>>
> >>>> That can trivially crash the system.
> >>>>
> >>>> Regards,
> >>>> Christian.
> >>>>
> >>>>>   }
> >>>>>
> >>>>>   static int cik_ih_early_init(void *handle)
> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> >>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> >>>>> index b8c47e0cf37a..076559668573 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> >>>>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct amdgpu_device
> >>>>> *adev,
> >>>>>       tmp = RREG32(mmIH_RB_CNTL);
> >>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> >>>>>       WREG32(mmIH_RB_CNTL, tmp);
> >>>>> -
> >>>>> +    ih->overflow = true;
> >>>>>
> >>>>>   out:
> >>>>>       return (wptr & ih->ptr_mask);
> >>>>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct amdgpu_device
> >>>>> *adev,
> >>>>>   static void cz_ih_set_rptr(struct amdgpu_device *adev,
> >>>>>                  struct amdgpu_ih_ring *ih)
> >>>>>   {
> >>>>> +    u32 tmp;
> >>>>> +
> >>>>>       WREG32(mmIH_RB_RPTR, ih->rptr);
> >>>>> +
> >>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
> >>>>> bit),
> >>>>> +     * reset it here to detect more overflows if they occur.
> >>>>> +     */
> >>>>> +    if (ih->overflow) {
> >>>>> +        tmp = RREG32(mmIH_RB_CNTL);
> >>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
> >>>>> 0);
> >>>>> +        WREG32(mmIH_RB_CNTL, tmp);
> >>>>> +        ih->overflow = false;
> >>>>> +    }
> >>>>>   }
> >>>>>
> >>>>>   static int cz_ih_early_init(void *handle)
> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> >>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> >>>>> index aecad530b10a..1a5e668643d1 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> >>>>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
> >>>>> amdgpu_device *adev,
> >>>>>       tmp = RREG32(mmIH_RB_CNTL);
> >>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> >>>>>       WREG32(mmIH_RB_CNTL, tmp);
> >>>>> -
> >>>>> +    ih->overflow = true;
> >>>>>
> >>>>>   out:
> >>>>>       return (wptr & ih->ptr_mask);
> >>>>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
> >>>>> amdgpu_device *adev,
> >>>>>   static void iceland_ih_set_rptr(struct amdgpu_device *adev,
> >>>>>                   struct amdgpu_ih_ring *ih)
> >>>>>   {
> >>>>> +    u32 tmp;
> >>>>> +
> >>>>>       WREG32(mmIH_RB_RPTR, ih->rptr);
> >>>>> +
> >>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
> >>>>> bit),
> >>>>> +     * reset it here to detect more overflows if they occur.
> >>>>> +     */
> >>>>> +    if (ih->overflow) {
> >>>>> +        tmp = RREG32(mmIH_RB_CNTL);
> >>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
> >>>>> 0);
> >>>>> +        WREG32(mmIH_RB_CNTL, tmp);
> >>>>> +        ih->overflow = false;
> >>>>> +    }
> >>>>>   }
> >>>>>
> >>>>>   static int iceland_ih_early_init(void *handle)
> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
> >>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
> >>>>> index d9ed7332d805..ce8f7feec713 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
> >>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct amdgpu_device
> >>>>> *adev,
> >>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
> >>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> >>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
> >>>>> +    ih->overflow = true;
> >>>>> +
> >>>>>   out:
> >>>>>       return (wptr & ih->ptr_mask);
> >>>>>   }
> >>>>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
> >>>>> amdgpu_device *adev,
> >>>>>   static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
> >>>>>                      struct amdgpu_ih_ring *ih)
> >>>>>   {
> >>>>> +    u32 tmp;
> >>>>>       struct amdgpu_ih_regs *ih_regs;
> >>>>>
> >>>>>       if (ih->use_doorbell) {
> >>>>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
> >>>>> amdgpu_device *adev,
> >>>>>           ih_regs = &ih->ih_regs;
> >>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
> >>>>>       }
> >>>>> +
> >>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
> >>>>> bit),
> >>>>> +     * reset it here to detect more overflows if they occur.
> >>>>> +     */
> >>>>> +    if (ih->overflow) {
> >>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
> >>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
> >>>>> 0);
> >>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
> >>>>> +        ih->overflow = false;
> >>>>> +    }
> >>>>>   }
> >>>>>
> >>>>>   /**
> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
> >>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
> >>>>> index 8fb05eae340a..668788ad34d9 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
> >>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct amdgpu_device
> >>>>> *adev,
> >>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
> >>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> >>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
> >>>>> +    ih->overflow = true;
> >>>>> +
> >>>>>   out:
> >>>>>       return (wptr & ih->ptr_mask);
> >>>>>   }
> >>>>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
> >>>>> amdgpu_device *adev,
> >>>>>   static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
> >>>>>                      struct amdgpu_ih_ring *ih)
> >>>>>   {
> >>>>> +    u32 tmp;
> >>>>>       struct amdgpu_ih_regs *ih_regs;
> >>>>>
> >>>>>       if (ih->use_doorbell) {
> >>>>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
> >>>>> amdgpu_device *adev,
> >>>>>           ih_regs = &ih->ih_regs;
> >>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
> >>>>>       }
> >>>>> +
> >>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
> >>>>> bit),
> >>>>> +     * reset it here to detect more overflows if they occur.
> >>>>> +     */
> >>>>> +    if (ih->overflow) {
> >>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
> >>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
> >>>>> 0);
> >>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
> >>>>> +        ih->overflow = false;
> >>>>> +    }
> >>>>>   }
> >>>>>
> >>>>>   /**
> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> >>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> >>>>> index e64b33115848..0bdac923cb4d 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> >>>>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
> >>>>> amdgpu_device *adev,
> >>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
> >>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> >>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
> >>>>> +    ih->overflow = true;
> >>>>>   out:
> >>>>>       return (wptr & ih->ptr_mask);
> >>>>>   }
> >>>>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
> >>>>> amdgpu_device *adev,
> >>>>>   static void navi10_ih_set_rptr(struct amdgpu_device *adev,
> >>>>>                      struct amdgpu_ih_ring *ih)
> >>>>>   {
> >>>>> +    u32 tmp;
> >>>>>       struct amdgpu_ih_regs *ih_regs;
> >>>>>
> >>>>>       if (ih == &adev->irq.ih_soft)
> >>>>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
> >>>>> amdgpu_device *adev,
> >>>>>           ih_regs = &ih->ih_regs;
> >>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
> >>>>>       }
> >>>>> +
> >>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
> >>>>> bit),
> >>>>> +     * reset it here to detect more overflows if they occur.
> >>>>> +     */
> >>>>> +    if (ih->overflow) {
> >>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
> >>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
> >>>>> 0);
> >>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
> >>>>> +        ih->overflow = false;
> >>>>> +    }
> >>>>>   }
> >>>>>
> >>>>>   /**
> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
> >>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
> >>>>> index 9a24f17a5750..ff35056d2b54 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
> >>>>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct amdgpu_device
> >>>>> *adev,
> >>>>>           tmp = RREG32(IH_RB_CNTL);
> >>>>>           tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
> >>>>>           WREG32(IH_RB_CNTL, tmp);
> >>>>> +        ih->overflow = true;
> >>>>>       }
> >>>>>       return (wptr & ih->ptr_mask);
> >>>>>   }
> >>>>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct amdgpu_device
> >>>>> *adev,
> >>>>>   static void si_ih_set_rptr(struct amdgpu_device *adev,
> >>>>>                  struct amdgpu_ih_ring *ih)
> >>>>>   {
> >>>>> +    u32 tmp;
> >>>>> +
> >>>>>       WREG32(IH_RB_RPTR, ih->rptr);
> >>>>> +
> >>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
> >>>>> bit),
> >>>>> +     * reset it here to detect more overflows if they occur.
> >>>>> +     */
> >>>>> +    if (ih->overflow) {
> >>>>> +        tmp = RREG32(IH_RB_CNTL);
> >>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
> >>>>> +        WREG32(IH_RB_CNTL, tmp);
> >>>>> +    }
> >>>>>   }
> >>>>>
> >>>>>   static int si_ih_early_init(void *handle)
> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> >>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> >>>>> index 917707bba7f3..6f5090d3db48 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> >>>>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device
> >>>>> *adev,
> >>>>>       tmp = RREG32(mmIH_RB_CNTL);
> >>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> >>>>>       WREG32(mmIH_RB_CNTL, tmp);
> >>>>> +    ih->overflow = true;
> >>>>>
> >>>>>   out:
> >>>>>       return (wptr & ih->ptr_mask);
> >>>>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
> >>>>> amdgpu_device *adev,
> >>>>>   static void tonga_ih_set_rptr(struct amdgpu_device *adev,
> >>>>>                     struct amdgpu_ih_ring *ih)
> >>>>>   {
> >>>>> +    u32 tmp;
> >>>>> +
> >>>>>       if (ih->use_doorbell) {
> >>>>>           /* XXX check if swapping is necessary on BE */
> >>>>>           *ih->rptr_cpu = ih->rptr;
> >>>>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
> >>>>> amdgpu_device *adev,
> >>>>>       } else {
> >>>>>           WREG32(mmIH_RB_RPTR, ih->rptr);
> >>>>>       }
> >>>>> +
> >>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
> >>>>> bit),
> >>>>> +     * reset it here to detect more overflows if they occur.
> >>>>> +     */
> >>>>> +    if (ih->overflow) {
> >>>>> +        tmp = RREG32(mmIH_RB_CNTL);
> >>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
> >>>>> 0);
> >>>>> +        WREG32(mmIH_RB_CNTL, tmp);
> >>>>> +        ih->overflow = false;
> >>>>> +    }
> >>>>>   }
> >>>>>
> >>>>>   static int tonga_ih_early_init(void *handle)
> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> >>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> >>>>> index d364c6dd152c..bb005924f194 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> >>>>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
> >>>>> amdgpu_device *adev,
> >>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
> >>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> >>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
> >>>>> +    ih->overflow = true;
> >>>>>
> >>>>>   out:
> >>>>>       return (wptr & ih->ptr_mask);
> >>>>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
> >>>>> amdgpu_device *adev,
> >>>>>   static void vega10_ih_set_rptr(struct amdgpu_device *adev,
> >>>>>                      struct amdgpu_ih_ring *ih)
> >>>>>   {
> >>>>> +    u32 tmp;
> >>>>>       struct amdgpu_ih_regs *ih_regs;
> >>>>>
> >>>>>       if (ih == &adev->irq.ih_soft)
> >>>>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
> >>>>> amdgpu_device *adev,
> >>>>>           ih_regs = &ih->ih_regs;
> >>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
> >>>>>       }
> >>>>> +
> >>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
> >>>>> bit),
> >>>>> +     * reset it here to detect more overflows if they occur.
> >>>>> +     */
> >>>>> +    if (ih->overflow) {
> >>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
> >>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
> >>>>> 0);
> >>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
> >>>>> +        ih->overflow = false;
> >>>>> +    }
> >>>>>   }
> >>>>>
> >>>>>   /**
> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> >>>>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> >>>>> index ddfc6941f9d5..bb725a970697 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> >>>>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
> >>>>> amdgpu_device *adev,
> >>>>>       tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
> >>>>>       tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> >>>>>       WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
> >>>>> +    ih->overflow = true;
> >>>>>
> >>>>>   out:
> >>>>>       return (wptr & ih->ptr_mask);
> >>>>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
> >>>>> amdgpu_device *adev,
> >>>>>   static void vega20_ih_set_rptr(struct amdgpu_device *adev,
> >>>>>                      struct amdgpu_ih_ring *ih)
> >>>>>   {
> >>>>> +    u32 tmp;
> >>>>>       struct amdgpu_ih_regs *ih_regs;
> >>>>>
> >>>>>       if (ih == &adev->irq.ih_soft)
> >>>>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
> >>>>> amdgpu_device *adev,
> >>>>>           ih_regs = &ih->ih_regs;
> >>>>>           WREG32(ih_regs->ih_rb_rptr, ih->rptr);
> >>>>>       }
> >>>>> +
> >>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
> >>>>> bit),
> >>>>> +     * reset it here to detect more overflows if they occur.
> >>>>> +     */
> >>>>> +    if (ih->overflow) {
> >>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
> >>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
> >>>>> 0);
> >>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
> >>>>> +        ih->overflow = false;
> >>>>> +    }
> >>>>>   }
> >>>>>
> >>>>>   /**
> >>>>> --
> >>>>> 2.43.0
> >>>>>
> >>>>
> >>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-17 23:00           ` Alex Deucher
@ 2024-01-17 23:44             ` Friedrich Vock
  2024-01-18 12:07               ` Christian König
  0 siblings, 1 reply; 28+ messages in thread
From: Friedrich Vock @ 2024-01-17 23:44 UTC (permalink / raw)
  To: Alex Deucher, Christian König, Christian Koenig
  Cc: Alex Deucher, Joshua Ashton, amd-gfx

On 18.01.24 00:00, Alex Deucher wrote:
> On Wed, Jan 17, 2024 at 7:36 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 16.01.24 um 11:31 schrieb Friedrich Vock:
>>> On 16.01.24 08:03, Christian König wrote:
>>>> Am 15.01.24 um 12:18 schrieb Friedrich Vock:
>>>>> [SNIP]
>>>>>>> +    if (ih->overflow) {
>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>> +        ih->overflow = false;
>>>>>>> +    }
>>>>>> Well that is an extremely bad idea. We already reset the overflow
>>>>>> after reading the WPTR.
>>>>> This is not resetting the overflow bit. This is resetting a "clear
>>>>> overflow" bit. I don't have the hardware docs, but the name (and my
>>>>> observations) strongly suggest that setting this bit actually prevents
>>>>> the hardware from setting the overflow bit ever again.
>>>> Well that doesn't make any sense at all. The hardware documentation
>>>> clearly states that this bit is write only and should always read as
>>>> zero.
>>>>
>>>> Setting this bit will clear the overflow flag in the WPTR register and
>>>> clearing it has no effect at all.
>>>>
>>>> I could only ping the hw engineer responsible for this block to double
>>>> check if the documentation is somehow outdated, but I really doubt so.
>>>>
>>> I see. I wish I had access to the documentation,
>> Well, doesn't Valve has an NDA in place?
>>
>>> but I don't, so all I
>>> can do is tell you what I observe the hardware doing. I've tested this
>>> on both a Steam Deck (OSSYS 5.2.0) and an RX 6700 XT (OSSYS 5.0.3). On
>>> both systems, launching a bunch of shaders that cause page faults leads
>>> to lots of "[gfxhub] page fault" messages in dmesg, followed by an
>>> "amdgpu: IH ring buffer overflow".
>> Well that is certainly a bug, maybe even the same thing we have seen on
>> Vega and MI.
>>
>> What we could do is to try to apply the same workaround to re-route the
>> page faults to a different IH ring.
>>
>> See those patches here as well:
>>
>> commit 516bc3d8dd7965f1a8a3ea453857f14d95971e62
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Fri Nov 2 15:00:16 2018 +0100
>>
>>       drm/amdgpu: reroute VMC and UMD to IH ring 1
>>
>>       Page faults can easily overwhelm the interrupt handler.
>>
>>       So to make sure that we never lose valuable interrupts on the
>> primary ring
>>       we re-route page faults to IH ring 1.
>>
>> commit b849aaa41c914a0fd88003f88cb04420a873c624
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Mon Mar 4 19:34:34 2019 +0100
>>
>>       drm/amdgpu: also reroute VMC and UMD to IH ring 1 on Vega 20
>>
>>       Same patch we alredy did for Vega10. Just re-route page faults to a
>> separate
>>       ring to avoid drowning in interrupts.
>>
>>> If I re-launch the same set of shaders after the GPU has soft-recovered,
>>> the "amdgpu: IH ring buffer overflow" message is missing, even though
>>> the same amount of page faults should've been triggered at roughly the
>>> same rate. Running with this patch applied makes more "amdgpu: IH ring
>>> buffer overflow" messages appear after relaunching the faulting shaders
>>> (but not when processing any non-faulting work).
>> That is actually the expected behavior. There should be a limit on the
>> number of faults written to the ring so that the ring never overflows.
>>
>>> The only possible conclusion I can draw from this is that clearing that
>>> bit *does* have an effect, and I don't think it's far-fetched to assume
>>> the IH ring buffer overflows still happen after re-launching the
>>> faulting shaders but go undetected so far.
>> Well that can only mean that the hw documentation is incorrect.
>>
>> Either the value is not write only trigger bit as documented or we need
>> an additional read of the register for it to take effect or something
>> like this.
>>
>>>>> Right now, IH overflows, even if they occur repeatedly, only get
>>>>> registered once. If not registering IH overflows can trivially lead to
>>>>> system crashes, it's amdgpu's current handling that is broken.
>>>> It's years that we last tested this but according to the HW
>>>> documentation this should work fine.
>>>>
>>>> What could potentially happen is that the IH has silenced the source
>>>> of the overflow. We never implemented resetting those, but in this
>>>> case that here won't help either.
>>>>
>>> If the IH silenced the page faults (which quite clearly cause the
>>> overflow here), then how are the page faults still logged in dmesg?
>> There should be a hardware rate limit for the page faults, e.g. there
>> can only be X faults reported in N clock cycles and then a delay is
>> inserted.
> @Christian Koenig  Is that tied to xnack (i.e., noretry)?  The default
> is noretry=1 on gfx10.3 and newer.  But it can be overridden.  It was
> not set on some older kernels, maybe that is the problem?  @Friedrich
> Vock does setting amdgpu.noretry=1 fix the issue?


No, amdgpu.noretry=1 does not change anything.

Regards,
Friedrich

> Alex
>
>>>>> The possibility of a repeated IH overflow in between reading the wptr
>>>>> and updating the rptr is a good point, but how can we detect that at
>>>>> all? It seems to me like we can't set the OVERFLOW_CLEAR bit at all
>>>>> then, because we're guaranteed to miss any overflows that happen while
>>>>> the bit is set.
>>>> When an IH overflow is signaled we clear that flag by writing 1 into
>>>> the OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.
>>>>
>>>> What can of course happen is that the IH ring buffer overflows more
>>>> than this single entry and we process IVs which are potentially
>>>> corrupted, but we won't miss any additional overflows since we only
>>>> start processing after resetting the flag.
>>>>
>>>> An IH overflow is also something you should *never* see in a
>>>> production system. This is purely for driver bringup and as fallback
>>>> when there is a severe incorrect programming of the HW.
>>>>
>>>> The only exception of that is page fault handling on MI products
>>>> because of a hardware bug, to mitigate this we are processing page
>>>> faults on a separate IH ring on those parts.
>>>>
>>>> On all other hw generations the IH should have some rate limit for the
>>>> number of faults generated per second, so that the CPU is always able
>>>> to catch up.
>>> I'm wondering if there is another bug in here somewhere. Your
>>> explanation of how it's supposed to work makes a lot of sense, but from
>>> what I can tell it doesn't work that way when I test it.
>>>
>>>  From the printk_ratelimit stats it would seem like >2000 faults arrive
>>> in less than a second, so perhaps your theory about fault interrupt
>>> ratelimiting not working is correct (but it's hard for me to verify what
>>> is going on without the documentation).
>> I'm going to ping the relevant engineer and putting someone on the task
>> to take a look.
>>
>> Thanks,
>> Christian.
>>
>>> Regards,
>>> Friedrich
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Regards,
>>>>> Friedrich
>>>>>
>>>>>> When you clear the overflow again when updating the RPTR you could
>>>>>> loose another overflow which might have happened in between and so
>>>>>> potentially process corrupted IVs.
>>>>>>
>>>>>> That can trivially crash the system.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>    }
>>>>>>>
>>>>>>>    static int cik_ih_early_init(void *handle)
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>> index b8c47e0cf37a..076559668573 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct amdgpu_device
>>>>>>> *adev,
>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>> -
>>>>>>> +    ih->overflow = true;
>>>>>>>
>>>>>>>    out:
>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct amdgpu_device
>>>>>>> *adev,
>>>>>>>    static void cz_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>    {
>>>>>>> +    u32 tmp;
>>>>>>> +
>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>> +
>>>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>>>> bit),
>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>> +     */
>>>>>>> +    if (ih->overflow) {
>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>> 0);
>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>> +        ih->overflow = false;
>>>>>>> +    }
>>>>>>>    }
>>>>>>>
>>>>>>>    static int cz_ih_early_init(void *handle)
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>> index aecad530b10a..1a5e668643d1 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>> -
>>>>>>> +    ih->overflow = true;
>>>>>>>
>>>>>>>    out:
>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>    static void iceland_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>                    struct amdgpu_ih_ring *ih)
>>>>>>>    {
>>>>>>> +    u32 tmp;
>>>>>>> +
>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>> +
>>>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>>>> bit),
>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>> +     */
>>>>>>> +    if (ih->overflow) {
>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>> 0);
>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>> +        ih->overflow = false;
>>>>>>> +    }
>>>>>>>    }
>>>>>>>
>>>>>>>    static int iceland_ih_early_init(void *handle)
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>> index d9ed7332d805..ce8f7feec713 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct amdgpu_device
>>>>>>> *adev,
>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>> +    ih->overflow = true;
>>>>>>> +
>>>>>>>    out:
>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>    }
>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>    static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>    {
>>>>>>> +    u32 tmp;
>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>
>>>>>>>        if (ih->use_doorbell) {
>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>        }
>>>>>>> +
>>>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>>>> bit),
>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>> +     */
>>>>>>> +    if (ih->overflow) {
>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>> 0);
>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>> +        ih->overflow = false;
>>>>>>> +    }
>>>>>>>    }
>>>>>>>
>>>>>>>    /**
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>> index 8fb05eae340a..668788ad34d9 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct amdgpu_device
>>>>>>> *adev,
>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>> +    ih->overflow = true;
>>>>>>> +
>>>>>>>    out:
>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>    }
>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>    static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>    {
>>>>>>> +    u32 tmp;
>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>
>>>>>>>        if (ih->use_doorbell) {
>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>        }
>>>>>>> +
>>>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>>>> bit),
>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>> +     */
>>>>>>> +    if (ih->overflow) {
>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>> 0);
>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>> +        ih->overflow = false;
>>>>>>> +    }
>>>>>>>    }
>>>>>>>
>>>>>>>    /**
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>> index e64b33115848..0bdac923cb4d 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>> +    ih->overflow = true;
>>>>>>>    out:
>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>    }
>>>>>>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>    static void navi10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>    {
>>>>>>> +    u32 tmp;
>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>
>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>        }
>>>>>>> +
>>>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>>>> bit),
>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>> +     */
>>>>>>> +    if (ih->overflow) {
>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>> 0);
>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>> +        ih->overflow = false;
>>>>>>> +    }
>>>>>>>    }
>>>>>>>
>>>>>>>    /**
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>> index 9a24f17a5750..ff35056d2b54 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct amdgpu_device
>>>>>>> *adev,
>>>>>>>            tmp = RREG32(IH_RB_CNTL);
>>>>>>>            tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>            WREG32(IH_RB_CNTL, tmp);
>>>>>>> +        ih->overflow = true;
>>>>>>>        }
>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>    }
>>>>>>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct amdgpu_device
>>>>>>> *adev,
>>>>>>>    static void si_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>    {
>>>>>>> +    u32 tmp;
>>>>>>> +
>>>>>>>        WREG32(IH_RB_RPTR, ih->rptr);
>>>>>>> +
>>>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>>>> bit),
>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>> +     */
>>>>>>> +    if (ih->overflow) {
>>>>>>> +        tmp = RREG32(IH_RB_CNTL);
>>>>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>> +        WREG32(IH_RB_CNTL, tmp);
>>>>>>> +    }
>>>>>>>    }
>>>>>>>
>>>>>>>    static int si_ih_early_init(void *handle)
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>> index 917707bba7f3..6f5090d3db48 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device
>>>>>>> *adev,
>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>> +    ih->overflow = true;
>>>>>>>
>>>>>>>    out:
>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>    static void tonga_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>>>    {
>>>>>>> +    u32 tmp;
>>>>>>> +
>>>>>>>        if (ih->use_doorbell) {
>>>>>>>            /* XXX check if swapping is necessary on BE */
>>>>>>>            *ih->rptr_cpu = ih->rptr;
>>>>>>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>        } else {
>>>>>>>            WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>        }
>>>>>>> +
>>>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>>>> bit),
>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>> +     */
>>>>>>> +    if (ih->overflow) {
>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>> 0);
>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>> +        ih->overflow = false;
>>>>>>> +    }
>>>>>>>    }
>>>>>>>
>>>>>>>    static int tonga_ih_early_init(void *handle)
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>> index d364c6dd152c..bb005924f194 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>> +    ih->overflow = true;
>>>>>>>
>>>>>>>    out:
>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>    static void vega10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>    {
>>>>>>> +    u32 tmp;
>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>
>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>        }
>>>>>>> +
>>>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>>>> bit),
>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>> +     */
>>>>>>> +    if (ih->overflow) {
>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>> 0);
>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>> +        ih->overflow = false;
>>>>>>> +    }
>>>>>>>    }
>>>>>>>
>>>>>>>    /**
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>> index ddfc6941f9d5..bb725a970697 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>> +    ih->overflow = true;
>>>>>>>
>>>>>>>    out:
>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>    static void vega20_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>    {
>>>>>>> +    u32 tmp;
>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>
>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>        }
>>>>>>> +
>>>>>>> +    /* If we overflowed previously (and thus set the OVERFLOW_CLEAR
>>>>>>> bit),
>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>> +     */
>>>>>>> +    if (ih->overflow) {
>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>> 0);
>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>> +        ih->overflow = false;
>>>>>>> +    }
>>>>>>>    }
>>>>>>>
>>>>>>>    /**
>>>>>>> --
>>>>>>> 2.43.0
>>>>>>>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-17 23:44             ` Friedrich Vock
@ 2024-01-18 12:07               ` Christian König
  2024-01-19 19:18                 ` Felix Kuehling
  0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2024-01-18 12:07 UTC (permalink / raw)
  To: Friedrich Vock, Alex Deucher, Christian Koenig
  Cc: Alex Deucher, Joshua Ashton, amd-gfx

Am 18.01.24 um 00:44 schrieb Friedrich Vock:
> On 18.01.24 00:00, Alex Deucher wrote:
>> [SNIP]
>>>>>> Right now, IH overflows, even if they occur repeatedly, only get
>>>>>> registered once. If not registering IH overflows can trivially 
>>>>>> lead to
>>>>>> system crashes, it's amdgpu's current handling that is broken.
>>>>> It's years that we last tested this but according to the HW
>>>>> documentation this should work fine.
>>>>>
>>>>> What could potentially happen is that the IH has silenced the source
>>>>> of the overflow. We never implemented resetting those, but in this
>>>>> case that here won't help either.
>>>>>
>>>> If the IH silenced the page faults (which quite clearly cause the
>>>> overflow here), then how are the page faults still logged in dmesg?
>>> There should be a hardware rate limit for the page faults, e.g. there
>>> can only be X faults reported in N clock cycles and then a delay is
>>> inserted.
>> @Christian Koenig  Is that tied to xnack (i.e., noretry)?  The default
>> is noretry=1 on gfx10.3 and newer.  But it can be overridden. It was
>> not set on some older kernels, maybe that is the problem? @Friedrich
>> Vock does setting amdgpu.noretry=1 fix the issue?
>
>
> No, amdgpu.noretry=1 does not change anything.

Well, the good news first: the hw engineer answered rather quickly. The
bad news is that the hardware really doesn't work as documented, in
multiple ways.

First of all, the CLEAR bit is a level and not a trigger, so the
intention to clear it is indeed correct. For now, please modify this
patch so that the CLEAR bit is set and then cleared again directly
afterwards; this way we should be able to detect further overflows immediately.

Then, the APU the Steam Deck uses simply doesn't have the filter function
for page faults in the hardware. The really bad news is that it also doesn't
have the extra IH rings where we could re-route the faults to prevent
overflows.

That fully explains the behavior you have been seeing, but doesn't really
provide a doable solution to mitigate this problem.

I'm going to dig deeper into the hw documentation and specification to 
see if we can use a different feature to avoid the overflow.

Thanks,
Christian.

>
> Regards,
> Friedrich
>
>> Alex
>>
>>>>>> The possibility of a repeated IH overflow in between reading the 
>>>>>> wptr
>>>>>> and updating the rptr is a good point, but how can we detect that at
>>>>>> all? It seems to me like we can't set the OVERFLOW_CLEAR bit at all
>>>>>> then, because we're guaranteed to miss any overflows that happen 
>>>>>> while
>>>>>> the bit is set.
>>>>> When an IH overflow is signaled we clear that flag by writing 1 into
>>>>> the OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.
>>>>>
>>>>> What can of course happen is that the IH ring buffer overflows more
>>>>> than this single entry and we process IVs which are potentially
>>>>> corrupted, but we won't miss any additional overflows since we only
>>>>> start processing after resetting the flag.
>>>>>
>>>>> An IH overflow is also something you should *never* see in a
>>>>> production system. This is purely for driver bringup and as fallback
>>>>> when there is a severe incorrect programming of the HW.
>>>>>
>>>>> The only exception of that is page fault handling on MI products
>>>>> because of a hardware bug, to mitigate this we are processing page
>>>>> faults on a separate IH ring on those parts.
>>>>>
>>>>> On all other hw generations the IH should have some rate limit for 
>>>>> the
>>>>> number of faults generated per second, so that the CPU is always able
>>>>> to catch up.
>>>> I'm wondering if there is another bug in here somewhere. Your
>>>> explanation of how it's supposed to work makes a lot of sense, but 
>>>> from
>>>> what I can tell it doesn't work that way when I test it.
>>>>
>>>>  From the printk_ratelimit stats it would seem like >2000 faults 
>>>> arrive
>>>> in less than a second, so perhaps your theory about fault interrupt
>>>> ratelimiting not working is correct (but it's hard for me to verify 
>>>> what
>>>> is going on without the documentation).
>>> I'm going to ping the relevant engineer and putting someone on the task
>>> to take a look.
>>>
>>> Thanks,
>>> Christian.
>>>
>>>> Regards,
>>>> Friedrich
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Regards,
>>>>>> Friedrich
>>>>>>
>>>>>>> When you clear the overflow again when updating the RPTR you could
>>>>>>> loose another overflow which might have happened in between and so
>>>>>>> potentially process corrupted IVs.
>>>>>>>
>>>>>>> That can trivially crash the system.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>    }
>>>>>>>>
>>>>>>>>    static int cik_ih_early_init(void *handle)
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>> index b8c47e0cf37a..076559668573 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct amdgpu_device
>>>>>>>> *adev,
>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>> -
>>>>>>>> +    ih->overflow = true;
>>>>>>>>
>>>>>>>>    out:
>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct 
>>>>>>>> amdgpu_device
>>>>>>>> *adev,
>>>>>>>>    static void cz_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>>    {
>>>>>>>> +    u32 tmp;
>>>>>>>> +
>>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>> +
>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>> OVERFLOW_CLEAR
>>>>>>>> bit),
>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>> +     */
>>>>>>>> +    if (ih->overflow) {
>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>>> 0);
>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>> +        ih->overflow = false;
>>>>>>>> +    }
>>>>>>>>    }
>>>>>>>>
>>>>>>>>    static int cz_ih_early_init(void *handle)
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>> index aecad530b10a..1a5e668643d1 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>> -
>>>>>>>> +    ih->overflow = true;
>>>>>>>>
>>>>>>>>    out:
>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>    static void iceland_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>                    struct amdgpu_ih_ring *ih)
>>>>>>>>    {
>>>>>>>> +    u32 tmp;
>>>>>>>> +
>>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>> +
>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>> OVERFLOW_CLEAR
>>>>>>>> bit),
>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>> +     */
>>>>>>>> +    if (ih->overflow) {
>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>>> 0);
>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>> +        ih->overflow = false;
>>>>>>>> +    }
>>>>>>>>    }
>>>>>>>>
>>>>>>>>    static int iceland_ih_early_init(void *handle)
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>> index d9ed7332d805..ce8f7feec713 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct 
>>>>>>>> amdgpu_device
>>>>>>>> *adev,
>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>> +    ih->overflow = true;
>>>>>>>> +
>>>>>>>>    out:
>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>    }
>>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>    static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>    {
>>>>>>>> +    u32 tmp;
>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>
>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>        }
>>>>>>>> +
>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>> OVERFLOW_CLEAR
>>>>>>>> bit),
>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>> +     */
>>>>>>>> +    if (ih->overflow) {
>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>>> 0);
>>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>> +        ih->overflow = false;
>>>>>>>> +    }
>>>>>>>>    }
>>>>>>>>
>>>>>>>>    /**
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>> index 8fb05eae340a..668788ad34d9 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct 
>>>>>>>> amdgpu_device
>>>>>>>> *adev,
>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>> +    ih->overflow = true;
>>>>>>>> +
>>>>>>>>    out:
>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>    }
>>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>    static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>    {
>>>>>>>> +    u32 tmp;
>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>
>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>        }
>>>>>>>> +
>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>> OVERFLOW_CLEAR
>>>>>>>> bit),
>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>> +     */
>>>>>>>> +    if (ih->overflow) {
>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>>> 0);
>>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>> +        ih->overflow = false;
>>>>>>>> +    }
>>>>>>>>    }
>>>>>>>>
>>>>>>>>    /**
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>> index e64b33115848..0bdac923cb4d 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>> +    ih->overflow = true;
>>>>>>>>    out:
>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>    }
>>>>>>>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>    static void navi10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>    {
>>>>>>>> +    u32 tmp;
>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>
>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>        }
>>>>>>>> +
>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>> OVERFLOW_CLEAR
>>>>>>>> bit),
>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>> +     */
>>>>>>>> +    if (ih->overflow) {
>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>>> 0);
>>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>> +        ih->overflow = false;
>>>>>>>> +    }
>>>>>>>>    }
>>>>>>>>
>>>>>>>>    /**
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>> index 9a24f17a5750..ff35056d2b54 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct amdgpu_device
>>>>>>>> *adev,
>>>>>>>>            tmp = RREG32(IH_RB_CNTL);
>>>>>>>>            tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>>            WREG32(IH_RB_CNTL, tmp);
>>>>>>>> +        ih->overflow = true;
>>>>>>>>        }
>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>    }
>>>>>>>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct 
>>>>>>>> amdgpu_device
>>>>>>>> *adev,
>>>>>>>>    static void si_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>>    {
>>>>>>>> +    u32 tmp;
>>>>>>>> +
>>>>>>>>        WREG32(IH_RB_RPTR, ih->rptr);
>>>>>>>> +
>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>> OVERFLOW_CLEAR
>>>>>>>> bit),
>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>> +     */
>>>>>>>> +    if (ih->overflow) {
>>>>>>>> +        tmp = RREG32(IH_RB_CNTL);
>>>>>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>> +        WREG32(IH_RB_CNTL, tmp);
>>>>>>>> +    }
>>>>>>>>    }
>>>>>>>>
>>>>>>>>    static int si_ih_early_init(void *handle)
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>> index 917707bba7f3..6f5090d3db48 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct 
>>>>>>>> amdgpu_device
>>>>>>>> *adev,
>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>> +    ih->overflow = true;
>>>>>>>>
>>>>>>>>    out:
>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>    static void tonga_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>>>>    {
>>>>>>>> +    u32 tmp;
>>>>>>>> +
>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>            /* XXX check if swapping is necessary on BE */
>>>>>>>>            *ih->rptr_cpu = ih->rptr;
>>>>>>>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>        } else {
>>>>>>>>            WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>        }
>>>>>>>> +
>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>> OVERFLOW_CLEAR
>>>>>>>> bit),
>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>> +     */
>>>>>>>> +    if (ih->overflow) {
>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>>> 0);
>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>> +        ih->overflow = false;
>>>>>>>> +    }
>>>>>>>>    }
>>>>>>>>
>>>>>>>>    static int tonga_ih_early_init(void *handle)
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>> index d364c6dd152c..bb005924f194 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>> +    ih->overflow = true;
>>>>>>>>
>>>>>>>>    out:
>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>    static void vega10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>    {
>>>>>>>> +    u32 tmp;
>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>
>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>        }
>>>>>>>> +
>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>> OVERFLOW_CLEAR
>>>>>>>> bit),
>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>> +     */
>>>>>>>> +    if (ih->overflow) {
>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>>> 0);
>>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>> +        ih->overflow = false;
>>>>>>>> +    }
>>>>>>>>    }
>>>>>>>>
>>>>>>>>    /**
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>> index ddfc6941f9d5..bb725a970697 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>> +    ih->overflow = true;
>>>>>>>>
>>>>>>>>    out:
>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>    static void vega20_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>    {
>>>>>>>> +    u32 tmp;
>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>
>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>        }
>>>>>>>> +
>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>> OVERFLOW_CLEAR
>>>>>>>> bit),
>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>> +     */
>>>>>>>> +    if (ih->overflow) {
>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR,
>>>>>>>> 0);
>>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>> +        ih->overflow = false;
>>>>>>>> +    }
>>>>>>>>    }
>>>>>>>>
>>>>>>>>    /**
>>>>>>>> -- 
>>>>>>>> 2.43.0
>>>>>>>>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-18 12:07               ` Christian König
@ 2024-01-19 19:18                 ` Felix Kuehling
  2024-01-22 10:10                   ` Christian König
  0 siblings, 1 reply; 28+ messages in thread
From: Felix Kuehling @ 2024-01-19 19:18 UTC (permalink / raw)
  To: Christian König, Friedrich Vock, Alex Deucher,
	Christian Koenig
  Cc: Alex Deucher, amd-gfx, Joshua Ashton

On 2024-01-18 07:07, Christian König wrote:
> Am 18.01.24 um 00:44 schrieb Friedrich Vock:
>> On 18.01.24 00:00, Alex Deucher wrote:
>>> [SNIP]
>>>>>>> Right now, IH overflows, even if they occur repeatedly, only get
>>>>>>> registered once. If not registering IH overflows can trivially 
>>>>>>> lead to
>>>>>>> system crashes, it's amdgpu's current handling that is broken.
>>>>>> It's been years since we last tested this, but according to the HW
>>>>>> documentation this should work fine.
>>>>>>
>>>>>> What could potentially happen is that the IH has silenced the source
>>>>>> of the overflow. We never implemented resetting those, but in this
>>>>>> case that here won't help either.
>>>>>>
>>>>> If the IH silenced the page faults (which quite clearly cause the
>>>>> overflow here), then how are the page faults still logged in dmesg?
>>>> There should be a hardware rate limit for the page faults, e.g. there
>>>> can only be X faults reported in N clock cycles and then a delay is
>>>> inserted.
>>> @Christian Koenig  Is that tied to xnack (i.e., noretry)?  The default
>>> is noretry=1 on gfx10.3 and newer.  But it can be overridden. It was
>>> not set on some older kernels, maybe that is the problem? @Friedrich
>>> Vock does setting amdgpu.noretry=1 fix the issue?
>>
>>
>> No, amdgpu.noretry=1 does not change anything.
>
> Well, the good news first: the hw engineer answered rather quickly. The 
> bad news is that the hardware really doesn't work as documented in 
> multiple ways.
>
> First of all the CLEAR bit is a level and not a trigger, so the 
> intention to clear it is indeed correct. For now please modify this 
> patch so that the CLEAR bit is set and cleared directly after setting 
> it, this way we should be able to detect further overflows immediately.
>
> Then, the APU the Steam Deck uses simply doesn't have the filter 
> function for page faults in the hardware. The really bad news is that it 
> also doesn't have the extra IH rings where we could re-route the 
> faults to prevent overflows.
>
> That fully explains the behavior you have been seeing, but doesn't 
> really provide a doable solution to mitigate this problem.
>
> I'm going to dig deeper into the hw documentation and specification to 
> see if we can use a different feature to avoid the overflow.

If we're not enabling retry faults, then each wave front should generate 
at most one fault. You should be able to avoid overflows by making the 
IH ring large enough to accommodate one fault per wave front.

If the faults are coming from SDMA, that may be another problem. I'm not 
sure whether it can generate multiple no-retry faults from the same queue.

Regards,
   Felix


>
> Thanks,
> Christian.
>
>>
>> Regards,
>> Friedrich
>>
>>> Alex
>>>
>>>>>>> The possibility of a repeated IH overflow in between reading the 
>>>>>>> wptr
>>>>>>> and updating the rptr is a good point, but how can we detect 
>>>>>>> that at
>>>>>>> all? It seems to me like we can't set the OVERFLOW_CLEAR bit at all
>>>>>>> then, because we're guaranteed to miss any overflows that happen 
>>>>>>> while
>>>>>>> the bit is set.
>>>>>> When an IH overflow is signaled we clear that flag by writing 1 into
>>>>>> the OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.
>>>>>>
>>>>>> What can of course happen is that the IH ring buffer overflows more
>>>>>> than this single entry and we process IVs which are potentially
>>>>>> corrupted, but we won't miss any additional overflows since we only
>>>>>> start processing after resetting the flag.
>>>>>>
>>>>>> An IH overflow is also something you should *never* see in a
>>>>>> production system. This is purely for driver bringup and as fallback
>>>>>> when there is a severe incorrect programming of the HW.
>>>>>>
>>>>>> The only exception to that is page fault handling on MI products
>>>>>> because of a hardware bug, to mitigate this we are processing page
>>>>>> faults on a separate IH ring on those parts.
>>>>>>
>>>>>> On all other hw generations the IH should have some rate limit 
>>>>>> for the
>>>>>> number of faults generated per second, so that the CPU is always 
>>>>>> able
>>>>>> to catch up.
>>>>> I'm wondering if there is another bug in here somewhere. Your
>>>>> explanation of how it's supposed to work makes a lot of sense, but 
>>>>> from
>>>>> what I can tell it doesn't work that way when I test it.
>>>>>
>>>>>  From the printk_ratelimit stats it would seem like >2000 faults 
>>>>> arrive
>>>>> in less than a second, so perhaps your theory about fault interrupt
>>>>> ratelimiting not working is correct (but it's hard for me to 
>>>>> verify what
>>>>> is going on without the documentation).
>>>> I'm going to ping the relevant engineer and put someone on the 
>>>> task
>>>> to take a look.
>>>>
>>>> Thanks,
>>>> Christian.
>>>>
>>>>> Regards,
>>>>> Friedrich
>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>> Regards,
>>>>>>> Friedrich
>>>>>>>
>>>>>>>> When you clear the overflow again when updating the RPTR you could
>>>>>>>> lose another overflow which might have happened in between and so
>>>>>>>> potentially process corrupted IVs.
>>>>>>>>
>>>>>>>> That can trivially crash the system.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>    }
>>>>>>>>>
>>>>>>>>>    static int cik_ih_early_init(void *handle)
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>> index b8c47e0cf37a..076559668573 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct 
>>>>>>>>> amdgpu_device
>>>>>>>>> *adev,
>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>> -
>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>
>>>>>>>>>    out:
>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct 
>>>>>>>>> amdgpu_device
>>>>>>>>> *adev,
>>>>>>>>>    static void cz_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>>>    {
>>>>>>>>> +    u32 tmp;
>>>>>>>>> +
>>>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>> +
>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>> bit),
>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>> +     */
>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>> 0);
>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>> +        ih->overflow = false;
>>>>>>>>> +    }
>>>>>>>>>    }
>>>>>>>>>
>>>>>>>>>    static int cz_ih_early_init(void *handle)
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>> index aecad530b10a..1a5e668643d1 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>> -
>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>
>>>>>>>>>    out:
>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>    static void iceland_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>                    struct amdgpu_ih_ring *ih)
>>>>>>>>>    {
>>>>>>>>> +    u32 tmp;
>>>>>>>>> +
>>>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>> +
>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>> bit),
>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>> +     */
>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>> 0);
>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>> +        ih->overflow = false;
>>>>>>>>> +    }
>>>>>>>>>    }
>>>>>>>>>
>>>>>>>>>    static int iceland_ih_early_init(void *handle)
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>> index d9ed7332d805..ce8f7feec713 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct 
>>>>>>>>> amdgpu_device
>>>>>>>>> *adev,
>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>> +    ih->overflow = true;
>>>>>>>>> +
>>>>>>>>>    out:
>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>    }
>>>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>    static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>    {
>>>>>>>>> +    u32 tmp;
>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>
>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>        }
>>>>>>>>> +
>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>> bit),
>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>> +     */
>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>> 0);
>>>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>> +        ih->overflow = false;
>>>>>>>>> +    }
>>>>>>>>>    }
>>>>>>>>>
>>>>>>>>>    /**
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>> index 8fb05eae340a..668788ad34d9 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct 
>>>>>>>>> amdgpu_device
>>>>>>>>> *adev,
>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>> +    ih->overflow = true;
>>>>>>>>> +
>>>>>>>>>    out:
>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>    }
>>>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>    static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>    {
>>>>>>>>> +    u32 tmp;
>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>
>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>        }
>>>>>>>>> +
>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>> bit),
>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>> +     */
>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>> 0);
>>>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>> +        ih->overflow = false;
>>>>>>>>> +    }
>>>>>>>>>    }
>>>>>>>>>
>>>>>>>>>    /**
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>> index e64b33115848..0bdac923cb4d 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>    out:
>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>    }
>>>>>>>>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>    static void navi10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>    {
>>>>>>>>> +    u32 tmp;
>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>
>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>        }
>>>>>>>>> +
>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>> bit),
>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>> +     */
>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>> 0);
>>>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>> +        ih->overflow = false;
>>>>>>>>> +    }
>>>>>>>>>    }
>>>>>>>>>
>>>>>>>>>    /**
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>> index 9a24f17a5750..ff35056d2b54 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct 
>>>>>>>>> amdgpu_device
>>>>>>>>> *adev,
>>>>>>>>>            tmp = RREG32(IH_RB_CNTL);
>>>>>>>>>            tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>>>            WREG32(IH_RB_CNTL, tmp);
>>>>>>>>> +        ih->overflow = true;
>>>>>>>>>        }
>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>    }
>>>>>>>>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct 
>>>>>>>>> amdgpu_device
>>>>>>>>> *adev,
>>>>>>>>>    static void si_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>>>    {
>>>>>>>>> +    u32 tmp;
>>>>>>>>> +
>>>>>>>>>        WREG32(IH_RB_RPTR, ih->rptr);
>>>>>>>>> +
>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>> bit),
>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>> +     */
>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>> +        tmp = RREG32(IH_RB_CNTL);
>>>>>>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>>> +        WREG32(IH_RB_CNTL, tmp);
>>>>>>>>> +    }
>>>>>>>>>    }
>>>>>>>>>
>>>>>>>>>    static int si_ih_early_init(void *handle)
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>> index 917707bba7f3..6f5090d3db48 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct 
>>>>>>>>> amdgpu_device
>>>>>>>>> *adev,
>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>
>>>>>>>>>    out:
>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>    static void tonga_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>>>>>    {
>>>>>>>>> +    u32 tmp;
>>>>>>>>> +
>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>>            /* XXX check if swapping is necessary on BE */
>>>>>>>>>            *ih->rptr_cpu = ih->rptr;
>>>>>>>>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>        } else {
>>>>>>>>>            WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>>        }
>>>>>>>>> +
>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>> bit),
>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>> +     */
>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>> 0);
>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>> +        ih->overflow = false;
>>>>>>>>> +    }
>>>>>>>>>    }
>>>>>>>>>
>>>>>>>>>    static int tonga_ih_early_init(void *handle)
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>> index d364c6dd152c..bb005924f194 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>
>>>>>>>>>    out:
>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>    static void vega10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>    {
>>>>>>>>> +    u32 tmp;
>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>
>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>        }
>>>>>>>>> +
>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>> bit),
>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>> +     */
>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>> 0);
>>>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>> +        ih->overflow = false;
>>>>>>>>> +    }
>>>>>>>>>    }
>>>>>>>>>
>>>>>>>>>    /**
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>> index ddfc6941f9d5..bb725a970697 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>
>>>>>>>>>    out:
>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>    static void vega20_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>    {
>>>>>>>>> +    u32 tmp;
>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>
>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>        }
>>>>>>>>> +
>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>> bit),
>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>> +     */
>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>> 0);
>>>>>>>>> +        WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>> +        ih->overflow = false;
>>>>>>>>> +    }
>>>>>>>>>    }
>>>>>>>>>
>>>>>>>>>    /**
>>>>>>>>> -- 
>>>>>>>>> 2.43.0
>>>>>>>>>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-19 19:18                 ` Felix Kuehling
@ 2024-01-22 10:10                   ` Christian König
  2024-01-22 10:21                     ` Friedrich Vock
  0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2024-01-22 10:10 UTC (permalink / raw)
  To: Felix Kuehling, Christian König, Friedrich Vock,
	Alex Deucher, Dommati, Sunil-kumar
  Cc: Alex Deucher, amd-gfx, Joshua Ashton

Am 19.01.24 um 20:18 schrieb Felix Kuehling:
> On 2024-01-18 07:07, Christian König wrote:
>> Am 18.01.24 um 00:44 schrieb Friedrich Vock:
>>> On 18.01.24 00:00, Alex Deucher wrote:
>>>> [SNIP]
>>>>>>>> Right now, IH overflows, even if they occur repeatedly, only get
>>>>>>>> registered once. If not registering IH overflows can trivially 
>>>>>>>> lead to
>>>>>>>> system crashes, it's amdgpu's current handling that is broken.
>>>>>>> It's years that we last tested this but according to the HW
>>>>>>> documentation this should work fine.
>>>>>>>
>>>>>>> What could potentially happen is that the IH has silenced the 
>>>>>>> source
>>>>>>> of the overflow. We never implemented resetting those, but in this
>>>>>>> case that here won't help either.
>>>>>>>
>>>>>> If the IH silenced the page faults (which quite clearly cause the
>>>>>> overflow here), then how are the page faults still logged in dmesg?
>>>>> There should be a hardware rate limit for the page faults, e.g. there
>>>>> can only be X faults reported in N clock cycles and then a delay is
>>>>> inserted.
>>>> @Christian Koenig  Is that tied to xnack (i.e., noretry)? The default
>>>> is noretry=1 on gfx10.3 and newer.  But it can be overridden. It was
>>>> not set on some older kernels, maybe that is the problem? @Friedrich
>>>> Vock does setting amdgpu.noretry=1 fix the issue?
>>>
>>>
>>> No, amdgpu.noretry=1 does not change anything.
>>
>> Well the good news first the hw engineer answered rather quickly. The 
>> bad news is that the hardware really doesn't work as documented in 
>> multiple ways.
>>
>> First of all the CLEAR bit is a level and not a trigger, so the 
>> intention to clear it is indeed correct. For now please modify this 
>> patch so that the CLEAR bit is set and cleared directly after setting 
>> it, this way we should be able to detect further overflows immediately.
>>
>> Then the APU the Steam Deck uses simply doesn't have the filter 
>> function for page faults in the hardware, the really bad news is it 
>> also doesn't have the extra IH rings where we could re-route the 
>> faults to prevent overflows.
>>
>> That full explains the behavior you have been seeing, but doesn't 
>> really provide a doable solution to mitigate this problem.
>>
>> I'm going to dig deeper into the hw documentation and specification 
>> to see if we can use a different feature to avoid the overflow.
>
> If we're not enabling retry faults, then each wave front should 
> generate at most one fault. You should be able to avoid overflows by 
> making the IH ring large enough to accommodate one fault per wave front.

That is the exact same argument our HW engineers came up with when we 
asked why the APU is missing all those nice IH ring overflow avoidance 
features the dGPUs have :)

The only problem with this approach is that on Navi, when a wave is 
blocked waiting on a fault, you can't kill it using soft recovery any 
more (at least if my understanding is correct).

>
> If the faults are coming from SDMA, that may be another problem. I'm 
> not sure whether it can generate multiple no-retry faults from the 
> same queue.

Regarding faults, the SDMA is relatively harmless compared to the 3D 
engine. IIRC the resolve queue is something like 128 entries deep, so 
you never see more than those 128 faults if I'm not completely mistaken.

Sunil is setting up a test system for this in an AMD lab and will play 
around with a few HW features to mitigate the issue. I still hope that 
we can avoid the overflow altogether.

Regards,
Christian.

>
> Regards,
>   Felix
>
>
>>
>> Thanks,
>> Christian.
>>
>>>
>>> Regards,
>>> Friedrich
>>>
>>>> Alex
>>>>
>>>>>>>> The possibility of a repeated IH overflow in between reading 
>>>>>>>> the wptr
>>>>>>>> and updating the rptr is a good point, but how can we detect 
>>>>>>>> that at
>>>>>>>> all? It seems to me like we can't set the OVERFLOW_CLEAR bit at 
>>>>>>>> all
>>>>>>>> then, because we're guaranteed to miss any overflows that 
>>>>>>>> happen while
>>>>>>>> the bit is set.
>>>>>>> When an IH overflow is signaled we clear that flag by writing 1 
>>>>>>> into
>>>>>>> the OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.
>>>>>>>
>>>>>>> What can of course happen is that the IH ring buffer overflows more
>>>>>>> than this single entry and we process IVs which are potentially
>>>>>>> corrupted, but we won't miss any additional overflows since we only
>>>>>>> start processing after resetting the flag.
>>>>>>>
>>>>>>> An IH overflow is also something you should *never* see in a
>>>>>>> production system. This is purely for driver bringup and as 
>>>>>>> fallback
>>>>>>> when there is a severe incorrect programming of the HW.
>>>>>>>
>>>>>>> The only exception of that is page fault handling on MI products
>>>>>>> because of a hardware bug, to mitigate this we are processing page
>>>>>>> faults on a separate IH ring on those parts.
>>>>>>>
>>>>>>> On all other hw generations the IH should have some rate limit 
>>>>>>> for the
>>>>>>> number of faults generated per second, so that the CPU is always 
>>>>>>> able
>>>>>>> to catch up.
>>>>>> I'm wondering if there is another bug in here somewhere. Your
>>>>>> explanation of how it's supposed to work makes a lot of sense, 
>>>>>> but from
>>>>>> what I can tell it doesn't work that way when I test it.
>>>>>>
>>>>>>  From the printk_ratelimit stats it would seem like >2000 faults 
>>>>>> arrive
>>>>>> in less than a second, so perhaps your theory about fault interrupt
>>>>>> ratelimiting not working is correct (but it's hard for me to 
>>>>>> verify what
>>>>>> is going on without the documentation).
>>>>> I'm going to ping the relevant engineer and putting someone on the 
>>>>> task
>>>>> to take a look.
>>>>>
>>>>> Thanks,
>>>>> Christian.
>>>>>
>>>>>> Regards,
>>>>>> Friedrich
>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Friedrich
>>>>>>>>
>>>>>>>>> When you clear the overflow again when updating the RPTR you 
>>>>>>>>> could
>>>>>>>>> loose another overflow which might have happened in between 
>>>>>>>>> and so
>>>>>>>>> potentially process corrupted IVs.
>>>>>>>>>
>>>>>>>>> That can trivially crash the system.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>>    static int cik_ih_early_init(void *handle)
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>> index b8c47e0cf37a..076559668573 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct 
>>>>>>>>>> amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>> -
>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>
>>>>>>>>>>    out:
>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct 
>>>>>>>>>> amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>>    static void cz_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>>>>    {
>>>>>>>>>> +    u32 tmp;
>>>>>>>>>> +
>>>>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>>> +
>>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>> bit),
>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>> 0);
>>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>> +    }
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>>    static int cz_ih_early_init(void *handle)
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>> index aecad530b10a..1a5e668643d1 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>> -
>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>
>>>>>>>>>>    out:
>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>    static void iceland_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>                    struct amdgpu_ih_ring *ih)
>>>>>>>>>>    {
>>>>>>>>>> +    u32 tmp;
>>>>>>>>>> +
>>>>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>>> +
>>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>> bit),
>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>> 0);
>>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>> +    }
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>>    static int iceland_ih_early_init(void *handle)
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>> index d9ed7332d805..ce8f7feec713 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct 
>>>>>>>>>> amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>> +
>>>>>>>>>>    out:
>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>    }
>>>>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>    static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>    {
>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>
>>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>        }
>>>>>>>>>> +
>>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>> bit),
>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>> 0);
>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>> +    }
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>>    /**
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>> index 8fb05eae340a..668788ad34d9 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct 
>>>>>>>>>> amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>> +
>>>>>>>>>>    out:
>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>    }
>>>>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>    static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>    {
>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>
>>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>        }
>>>>>>>>>> +
>>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>> bit),
>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>> 0);
>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>> +    }
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>>    /**
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>> index e64b33115848..0bdac923cb4d 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>    out:
>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>    }
>>>>>>>>>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>    static void navi10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>    {
>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>
>>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>        }
>>>>>>>>>> +
>>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>> bit),
>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>> 0);
>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>> +    }
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>>    /**
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>> index 9a24f17a5750..ff35056d2b54 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct 
>>>>>>>>>> amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>>            tmp = RREG32(IH_RB_CNTL);
>>>>>>>>>>            tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>>>>            WREG32(IH_RB_CNTL, tmp);
>>>>>>>>>> +        ih->overflow = true;
>>>>>>>>>>        }
>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>    }
>>>>>>>>>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct 
>>>>>>>>>> amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>>    static void si_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>>>>    {
>>>>>>>>>> +    u32 tmp;
>>>>>>>>>> +
>>>>>>>>>>        WREG32(IH_RB_RPTR, ih->rptr);
>>>>>>>>>> +
>>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>> bit),
>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>> +        tmp = RREG32(IH_RB_CNTL);
>>>>>>>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>>>> +        WREG32(IH_RB_CNTL, tmp);
>>>>>>>>>> +    }
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>>    static int si_ih_early_init(void *handle)
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>> index 917707bba7f3..6f5090d3db48 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct 
>>>>>>>>>> amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>
>>>>>>>>>>    out:
>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>    static void tonga_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>>>>>>    {
>>>>>>>>>> +    u32 tmp;
>>>>>>>>>> +
>>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>>>            /* XXX check if swapping is necessary on BE */
>>>>>>>>>>            *ih->rptr_cpu = ih->rptr;
>>>>>>>>>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>        } else {
>>>>>>>>>>            WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>>>        }
>>>>>>>>>> +
>>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>> bit),
>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>> 0);
>>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>> +    }
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>>    static int tonga_ih_early_init(void *handle)
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>> index d364c6dd152c..bb005924f194 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>
>>>>>>>>>>    out:
>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>    static void vega10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>    {
>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>
>>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>        }
>>>>>>>>>> +
>>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>> bit),
>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>> 0);
>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>> +    }
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>>    /**
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>> index ddfc6941f9d5..bb725a970697 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>
>>>>>>>>>>    out:
>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>    static void vega20_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>    {
>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>
>>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>        }
>>>>>>>>>> +
>>>>>>>>>> +    /* If we overflowed previously (and thus set the 
>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>> bit),
>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, 
>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>> 0);
>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>> +    }
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>>    /**
>>>>>>>>>> -- 
>>>>>>>>>> 2.43.0
>>>>>>>>>>
>>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-22 10:10                   ` Christian König
@ 2024-01-22 10:21                     ` Friedrich Vock
  2024-01-22 10:45                       ` Friedrich Vock
  0 siblings, 1 reply; 28+ messages in thread
From: Friedrich Vock @ 2024-01-22 10:21 UTC (permalink / raw)
  To: Christian König, Felix Kuehling, Christian König,
	Alex Deucher, Dommati, Sunil-kumar
  Cc: Alex Deucher, Joshua Ashton, amd-gfx

On 22.01.24 11:10, Christian König wrote:
> Am 19.01.24 um 20:18 schrieb Felix Kuehling:
>> On 2024-01-18 07:07, Christian König wrote:
>>> Am 18.01.24 um 00:44 schrieb Friedrich Vock:
>>>> On 18.01.24 00:00, Alex Deucher wrote:
>>>>> [SNIP]
>>>>>>>>> Right now, IH overflows, even if they occur repeatedly, only get
>>>>>>>>> registered once. If not registering IH overflows can trivially
>>>>>>>>> lead to
>>>>>>>>> system crashes, it's amdgpu's current handling that is broken.
>>>>>>>> It's years that we last tested this but according to the HW
>>>>>>>> documentation this should work fine.
>>>>>>>>
>>>>>>>> What could potentially happen is that the IH has silenced the
>>>>>>>> source
>>>>>>>> of the overflow. We never implemented resetting those, but in this
>>>>>>>> case that here won't help either.
>>>>>>>>
>>>>>>> If the IH silenced the page faults (which quite clearly cause the
>>>>>>> overflow here), then how are the page faults still logged in dmesg?
>>>>>> There should be a hardware rate limit for the page faults, e.g.
>>>>>> there
>>>>>> can only be X faults reported in N clock cycles and then a delay is
>>>>>> inserted.
>>>>> @Christian Koenig  Is that tied to xnack (i.e., noretry)? The default
>>>>> is noretry=1 on gfx10.3 and newer.  But it can be overridden. It was
>>>>> not set on some older kernels, maybe that is the problem? @Friedrich
>>>>> Vock does setting amdgpu.noretry=1 fix the issue?
>>>>
>>>>
>>>> No, amdgpu.noretry=1 does not change anything.
>>>
>>> Well the good news first the hw engineer answered rather quickly.
>>> The bad news is that the hardware really doesn't work as documented
>>> in multiple ways.
>>>
>>> First of all the CLEAR bit is a level and not a trigger, so the
>>> intention to clear it is indeed correct. For now please modify this
>>> patch so that the CLEAR bit is set and cleared directly after
>>> setting it, this way we should be able to detect further overflows
>>> immediately.
>>>
>>> Then the APU the Steam Deck uses simply doesn't have the filter
>>> function for page faults in the hardware, the really bad news is it
>>> also doesn't have the extra IH rings where we could re-route the
>>> faults to prevent overflows.
>>>
>>> That full explains the behavior you have been seeing, but doesn't
>>> really provide a doable solution to mitigate this problem.
>>>
>>> I'm going to dig deeper into the hw documentation and specification
>>> to see if we can use a different feature to avoid the overflow.
>>
>> If we're not enabling retry faults, then each wave front should
>> generate at most one fault. You should be able to avoid overflows by
>> making the IH ring large enough to accommodate one fault per wave front.
>
> That is the exact same argument our HW engineers came up with when we
> asked why the APU is missing all those nice IH ring overflow avoidance
> features the dGPUs have :)
>
I can reproduce IH overflows on my RX 6700 XT dGPU as well FWIW.

> The only problem with this approach is that on Navi when a wave is
> blocked by waiting on a fault you can't kill it using soft recovery
> any more (at least when my understanding is correct).
>
Killing page-faulted waves via soft recovery works. From my testing on
Deck, it seems to take a bit of time, but if you try for long enough
soft recovery eventually succeeds.

Regards,
Friedrich

>>
>> If the faults are coming from SDMA, that may be another problem. I'm
>> not sure whether it can generate multiple no-retry faults from the
>> same queue.
>
> Regarding faults the SDMA is relatively harmless compared to the 3D
> engine, IIRC the resolve queue is something like 128 entries deep. So
> you never see more than those 128 faults if I'm not completely mistaken.
>
> Sunil is setting up a test system for this in an AMD lab and will play
> around with a few HW features to mitigate the issue. I still hope that
> we can completely avoid the overflow altogether.
>
> Regards,
> Christian.
>
>>
>> Regards,
>>   Felix
>>
>>
>>>
>>> Thanks,
>>> Christian.
>>>
>>>>
>>>> Regards,
>>>> Friedrich
>>>>
>>>>> Alex
>>>>>
>>>>>>>>> The possibility of a repeated IH overflow in between reading
>>>>>>>>> the wptr
>>>>>>>>> and updating the rptr is a good point, but how can we detect
>>>>>>>>> that at
>>>>>>>>> all? It seems to me like we can't set the OVERFLOW_CLEAR bit
>>>>>>>>> at all
>>>>>>>>> then, because we're guaranteed to miss any overflows that
>>>>>>>>> happen while
>>>>>>>>> the bit is set.
>>>>>>>> When an IH overflow is signaled we clear that flag by writing 1
>>>>>>>> into
>>>>>>>> the OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.
>>>>>>>>
>>>>>>>> What can of course happen is that the IH ring buffer overflows
>>>>>>>> more
>>>>>>>> than this single entry and we process IVs which are potentially
>>>>>>>> corrupted, but we won't miss any additional overflows since we
>>>>>>>> only
>>>>>>>> start processing after resetting the flag.
>>>>>>>>
>>>>>>>> An IH overflow is also something you should *never* see in a
>>>>>>>> production system. This is purely for driver bringup and as
>>>>>>>> fallback
>>>>>>>> when there is a severe incorrect programming of the HW.
>>>>>>>>
>>>>>>>> The only exception of that is page fault handling on MI products
>>>>>>>> because of a hardware bug, to mitigate this we are processing page
>>>>>>>> faults on a separate IH ring on those parts.
>>>>>>>>
>>>>>>>> On all other hw generations the IH should have some rate limit
>>>>>>>> for the
>>>>>>>> number of faults generated per second, so that the CPU is
>>>>>>>> always able
>>>>>>>> to catch up.
>>>>>>> I'm wondering if there is another bug in here somewhere. Your
>>>>>>> explanation of how it's supposed to work makes a lot of sense,
>>>>>>> but from
>>>>>>> what I can tell it doesn't work that way when I test it.
>>>>>>>
>>>>>>>  From the printk_ratelimit stats it would seem like >2000 faults
>>>>>>> arrive
>>>>>>> in less than a second, so perhaps your theory about fault interrupt
>>>>>>> ratelimiting not working is correct (but it's hard for me to
>>>>>>> verify what
>>>>>>> is going on without the documentation).
>>>>>> I'm going to ping the relevant engineer and putting someone on
>>>>>> the task
>>>>>> to take a look.
>>>>>>
>>>>>> Thanks,
>>>>>> Christian.
>>>>>>
>>>>>>> Regards,
>>>>>>> Friedrich
>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Friedrich
>>>>>>>>>
>>>>>>>>>> When you clear the overflow again when updating the RPTR you
>>>>>>>>>> could
>>>>>>>>>> loose another overflow which might have happened in between
>>>>>>>>>> and so
>>>>>>>>>> potentially process corrupted IVs.
>>>>>>>>>>
>>>>>>>>>> That can trivially crash the system.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>>    static int cik_ih_early_init(void *handle)
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>>> index b8c47e0cf37a..076559668573 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct
>>>>>>>>>>> amdgpu_device
>>>>>>>>>>> *adev,
>>>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>> -
>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>
>>>>>>>>>>>    out:
>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct
>>>>>>>>>>> amdgpu_device
>>>>>>>>>>> *adev,
>>>>>>>>>>>    static void cz_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>>>>>    {
>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>> +
>>>>>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>>>> +
>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>> bit),
>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>> 0);
>>>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>> +    }
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>>    static int cz_ih_early_init(void *handle)
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>>> index aecad530b10a..1a5e668643d1 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>> -
>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>
>>>>>>>>>>>    out:
>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>    static void iceland_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>                    struct amdgpu_ih_ring *ih)
>>>>>>>>>>>    {
>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>> +
>>>>>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>>>> +
>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>> bit),
>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>> 0);
>>>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>> +    }
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>>    static int iceland_ih_early_init(void *handle)
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>>> index d9ed7332d805..ce8f7feec713 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct
>>>>>>>>>>> amdgpu_device
>>>>>>>>>>> *adev,
>>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>> +
>>>>>>>>>>>    out:
>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>    }
>>>>>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>    static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>>    {
>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>>
>>>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>>        }
>>>>>>>>>>> +
>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>> bit),
>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>> 0);
>>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>> +    }
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>>    /**
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>>> index 8fb05eae340a..668788ad34d9 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct
>>>>>>>>>>> amdgpu_device
>>>>>>>>>>> *adev,
>>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>> +
>>>>>>>>>>>    out:
>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>    }
>>>>>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>    static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>>    {
>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>>
>>>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>>        }
>>>>>>>>>>> +
>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>> bit),
>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>> 0);
>>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>> +    }
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>>    /**
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>>> index e64b33115848..0bdac923cb4d 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>    out:
>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>    }
>>>>>>>>>>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>    static void navi10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>>    {
>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>>
>>>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>>>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>>        }
>>>>>>>>>>> +
>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>> bit),
>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>> 0);
>>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>> +    }
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>>    /**
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>>> index 9a24f17a5750..ff35056d2b54 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct
>>>>>>>>>>> amdgpu_device
>>>>>>>>>>> *adev,
>>>>>>>>>>>            tmp = RREG32(IH_RB_CNTL);
>>>>>>>>>>>            tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>>>>>            WREG32(IH_RB_CNTL, tmp);
>>>>>>>>>>> +        ih->overflow = true;
>>>>>>>>>>>        }
>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>    }
>>>>>>>>>>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct
>>>>>>>>>>> amdgpu_device
>>>>>>>>>>> *adev,
>>>>>>>>>>>    static void si_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>>>>>    {
>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>> +
>>>>>>>>>>>        WREG32(IH_RB_RPTR, ih->rptr);
>>>>>>>>>>> +
>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>> bit),
>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>> +        tmp = RREG32(IH_RB_CNTL);
>>>>>>>>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>>>>> +        WREG32(IH_RB_CNTL, tmp);
>>>>>>>>>>> +    }
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>>    static int si_ih_early_init(void *handle)
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>>> index 917707bba7f3..6f5090d3db48 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct
>>>>>>>>>>> amdgpu_device
>>>>>>>>>>> *adev,
>>>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>
>>>>>>>>>>>    out:
>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>    static void tonga_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>>>>>>>    {
>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>> +
>>>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>>>>            /* XXX check if swapping is necessary on BE */
>>>>>>>>>>>            *ih->rptr_cpu = ih->rptr;
>>>>>>>>>>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>        } else {
>>>>>>>>>>>            WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>>>>        }
>>>>>>>>>>> +
>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>> bit),
>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>> 0);
>>>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>> +    }
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>>    static int tonga_ih_early_init(void *handle)
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>> index d364c6dd152c..bb005924f194 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>
>>>>>>>>>>>    out:
>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>    static void vega10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>>    {
>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>>
>>>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>>>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>>        }
>>>>>>>>>>> +
>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>> bit),
>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>> 0);
>>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>> +    }
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>>    /**
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>>> index ddfc6941f9d5..bb725a970697 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>
>>>>>>>>>>>    out:
>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>    static void vega20_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>>    {
>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>>
>>>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>>>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>>        }
>>>>>>>>>>> +
>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>> bit),
>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>> 0);
>>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>> +    }
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>>    /**
>>>>>>>>>>> --
>>>>>>>>>>> 2.43.0
>>>>>>>>>>>
>>>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-22 10:21                     ` Friedrich Vock
@ 2024-01-22 10:45                       ` Friedrich Vock
  2024-01-22 13:35                         ` Christian König
  0 siblings, 1 reply; 28+ messages in thread
From: Friedrich Vock @ 2024-01-22 10:45 UTC (permalink / raw)
  To: Christian König, Felix Kuehling, Christian König,
	Alex Deucher, Dommati, Sunil-kumar
  Cc: Alex Deucher, amd-gfx, Joshua Ashton

On 22.01.24 11:21, Friedrich Vock wrote:
> On 22.01.24 11:10, Christian König wrote:
>> Am 19.01.24 um 20:18 schrieb Felix Kuehling:
>>> On 2024-01-18 07:07, Christian König wrote:
>>>> Am 18.01.24 um 00:44 schrieb Friedrich Vock:
>>>>> On 18.01.24 00:00, Alex Deucher wrote:
>>>>>> [SNIP]
>>>>>>>>>> Right now, IH overflows, even if they occur repeatedly, only get
>>>>>>>>>> registered once. If not registering IH overflows can trivially
>>>>>>>>>> lead to
>>>>>>>>>> system crashes, it's amdgpu's current handling that is broken.
>>>>>>>>> It's been years since we last tested this, but according to the HW
>>>>>>>>> documentation this should work fine.
>>>>>>>>>
>>>>>>>>> What could potentially happen is that the IH has silenced the
>>>>>>>>> source
>>>>>>>>> of the overflow. We never implemented resetting those, but in
>>>>>>>>> this
>>>>>>>>> case that here won't help either.
>>>>>>>>>
>>>>>>>> If the IH silenced the page faults (which quite clearly cause the
>>>>>>>> overflow here), then how are the page faults still logged in
>>>>>>>> dmesg?
>>>>>>> There should be a hardware rate limit for the page faults, e.g.
>>>>>>> there
>>>>>>> can only be X faults reported in N clock cycles and then a delay is
>>>>>>> inserted.
>>>>>> @Christian Koenig  Is that tied to xnack (i.e., noretry)? The
>>>>>> default
>>>>>> is noretry=1 on gfx10.3 and newer.  But it can be overridden. It was
>>>>>> not set on some older kernels, maybe that is the problem? @Friedrich
>>>>>> Vock does setting amdgpu.noretry=1 fix the issue?
>>>>>
>>>>>
>>>>> No, amdgpu.noretry=1 does not change anything.
>>>>
>>>> Well, the good news first: the hw engineer answered rather quickly.
>>>> The bad news is that the hardware really doesn't work as documented
>>>> in multiple ways.
>>>>
>>>> First of all the CLEAR bit is a level and not a trigger, so the
>>>> intention to clear it is indeed correct. For now please modify this
>>>> patch so that the CLEAR bit is set and cleared directly after
>>>> setting it, this way we should be able to detect further overflows
>>>> immediately.
>>>>
>>>> Then the APU the Steam Deck uses simply doesn't have the filter
>>>> function for page faults in the hardware, the really bad news is it
>>>> also doesn't have the extra IH rings where we could re-route the
>>>> faults to prevent overflows.
>>>>
>>>> That fully explains the behavior you have been seeing, but doesn't
>>>> really provide a doable solution to mitigate this problem.
>>>>
>>>> I'm going to dig deeper into the hw documentation and specification
>>>> to see if we can use a different feature to avoid the overflow.
>>>
>>> If we're not enabling retry faults, then each wave front should
>>> generate at most one fault. You should be able to avoid overflows by
>>> making the IH ring large enough to accommodate one fault per wave
>>> front.
>>
>> That is the exact same argument our HW engineers came up with when we
>> asked why the APU is missing all those nice IH ring overflow avoidance
>> features the dGPUs have :)
>>
> I can reproduce IH overflows on my RX 6700 XT dGPU as well FWIW.
>
>> The only problem with this approach is that on Navi when a wave is
>> blocked by waiting on a fault you can't kill it using soft recovery
>> any more (at least when my understanding is correct).
>>
> Killing page-faulted waves via soft recovery works. From my testing on
> Deck, it seems to take a bit of time, but if you try for long enough
> soft recovery eventually succeeds.


On second thought, could it be that this is the critical flaw in the "at
most one fault per wave" thinking?

Most work submissions in practice submit more waves than the number of
wave slots the GPU has.
As far as I understand soft recovery, the only thing it does is kill all
active waves. This frees up the CUs so more waves are launched, which
can fault again, and that leads to potentially lots of faults for a
single wave slot in the end.

Regards,
Friedrich

>
> Regards,
> Friedrich
>
>>>
>>> If the faults are coming from SDMA, that may be another problem. I'm
>>> not sure whether it can generate multiple no-retry faults from the
>>> same queue.
>>
>> Regarding faults the SDMA is relatively harmless compared to the 3D
>> engine, IIRC the resolve queue is something like 128 entries deep. So
>> you never see more than those 128 faults if I'm not completely mistaken.
>>
>> Sunil is setting up a test system for this in an AMD lab and will play
>> around with a few HW features to mitigate the issue. I still hope that
>> we can completely avoid the overflow altogether.
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>>   Felix
>>>
>>>
>>>>
>>>> Thanks,
>>>> Christian.
>>>>
>>>>>
>>>>> Regards,
>>>>> Friedrich
>>>>>
>>>>>> Alex
>>>>>>
>>>>>>>>>> The possibility of a repeated IH overflow in between reading
>>>>>>>>>> the wptr
>>>>>>>>>> and updating the rptr is a good point, but how can we detect
>>>>>>>>>> that at
>>>>>>>>>> all? It seems to me like we can't set the OVERFLOW_CLEAR bit
>>>>>>>>>> at all
>>>>>>>>>> then, because we're guaranteed to miss any overflows that
>>>>>>>>>> happen while
>>>>>>>>>> the bit is set.
>>>>>>>>> When an IH overflow is signaled we clear that flag by writing 1
>>>>>>>>> into
>>>>>>>>> the OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.
>>>>>>>>>
>>>>>>>>> What can of course happen is that the IH ring buffer overflows
>>>>>>>>> more
>>>>>>>>> than this single entry and we process IVs which are potentially
>>>>>>>>> corrupted, but we won't miss any additional overflows since we
>>>>>>>>> only
>>>>>>>>> start processing after resetting the flag.
>>>>>>>>>
>>>>>>>>> An IH overflow is also something you should *never* see in a
>>>>>>>>> production system. This is purely for driver bringup and as
>>>>>>>>> fallback
>>>>>>>>> when there is a severe incorrect programming of the HW.
>>>>>>>>>
>>>>>>>>> The only exception to that is page fault handling on MI products
>>>>>>>>> because of a hardware bug, to mitigate this we are processing
>>>>>>>>> page
>>>>>>>>> faults on a separate IH ring on those parts.
>>>>>>>>>
>>>>>>>>> On all other hw generations the IH should have some rate limit
>>>>>>>>> for the
>>>>>>>>> number of faults generated per second, so that the CPU is
>>>>>>>>> always able
>>>>>>>>> to catch up.
>>>>>>>> I'm wondering if there is another bug in here somewhere. Your
>>>>>>>> explanation of how it's supposed to work makes a lot of sense,
>>>>>>>> but from
>>>>>>>> what I can tell it doesn't work that way when I test it.
>>>>>>>>
>>>>>>>>  From the printk_ratelimit stats it would seem like >2000 faults
>>>>>>>> arrive
>>>>>>>> in less than a second, so perhaps your theory about fault
>>>>>>>> interrupt
>>>>>>>> ratelimiting not working is correct (but it's hard for me to
>>>>>>>> verify what
>>>>>>>> is going on without the documentation).
>>>>>>> I'm going to ping the relevant engineer and put someone on
>>>>>>> the task
>>>>>>> to take a look.
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Friedrich
>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Friedrich
>>>>>>>>>>
>>>>>>>>>>> When you clear the overflow again when updating the RPTR you
>>>>>>>>>>> could
>>>>>>>>>>> lose another overflow which might have happened in between
>>>>>>>>>>> and so
>>>>>>>>>>> potentially process corrupted IVs.
>>>>>>>>>>>
>>>>>>>>>>> That can trivially crash the system.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>>    static int cik_ih_early_init(void *handle)
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>>>> index b8c47e0cf37a..076559668573 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>>>>>>>>> @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct
>>>>>>>>>>>> amdgpu_device
>>>>>>>>>>>> *adev,
>>>>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>>> -
>>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>>
>>>>>>>>>>>>    out:
>>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>> @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct
>>>>>>>>>>>> amdgpu_device
>>>>>>>>>>>> *adev,
>>>>>>>>>>>>    static void cz_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>>>>>>    {
>>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>> +
>>>>>>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>>> bit),
>>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>>> 0);
>>>>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>>> +    }
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>>    static int cz_ih_early_init(void *handle)
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>>>> index aecad530b10a..1a5e668643d1 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>>>>>>>>> @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>>> -
>>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>>
>>>>>>>>>>>>    out:
>>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>> @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>    static void iceland_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>>                    struct amdgpu_ih_ring *ih)
>>>>>>>>>>>>    {
>>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>> +
>>>>>>>>>>>>        WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>>> bit),
>>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>>> 0);
>>>>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>>> +    }
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>>    static int iceland_ih_early_init(void *handle)
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>>>> index d9ed7332d805..ce8f7feec713 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
>>>>>>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct
>>>>>>>>>>>> amdgpu_device
>>>>>>>>>>>> *adev,
>>>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>> +
>>>>>>>>>>>>    out:
>>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>>    }
>>>>>>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>    static void ih_v6_0_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>>>    {
>>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>>>
>>>>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>>>        }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>>> bit),
>>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>>> 0);
>>>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>>> +    }
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>>    /**
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>>>> index 8fb05eae340a..668788ad34d9 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c
>>>>>>>>>>>> @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct
>>>>>>>>>>>> amdgpu_device
>>>>>>>>>>>> *adev,
>>>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>> +
>>>>>>>>>>>>    out:
>>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>>    }
>>>>>>>>>>>> @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>    static void ih_v6_1_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>>>    {
>>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>>>
>>>>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>>>>> @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>>>        }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>>> bit),
>>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>>> 0);
>>>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>>> +    }
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>>    /**
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>>>> index e64b33115848..0bdac923cb4d 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>>>>>>>>> @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>>    out:
>>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>>    }
>>>>>>>>>>>> @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>    static void navi10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>>>    {
>>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>>>
>>>>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>>>>> @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>>>        }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>>> bit),
>>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>>> 0);
>>>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>>> +    }
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>>    /**
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>>>> index 9a24f17a5750..ff35056d2b54 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>>>>>>>>> @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct
>>>>>>>>>>>> amdgpu_device
>>>>>>>>>>>> *adev,
>>>>>>>>>>>>            tmp = RREG32(IH_RB_CNTL);
>>>>>>>>>>>>            tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>>>>>>            WREG32(IH_RB_CNTL, tmp);
>>>>>>>>>>>> +        ih->overflow = true;
>>>>>>>>>>>>        }
>>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>>    }
>>>>>>>>>>>> @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct
>>>>>>>>>>>> amdgpu_device
>>>>>>>>>>>> *adev,
>>>>>>>>>>>>    static void si_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>>                   struct amdgpu_ih_ring *ih)
>>>>>>>>>>>>    {
>>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>> +
>>>>>>>>>>>>        WREG32(IH_RB_RPTR, ih->rptr);
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>>> bit),
>>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>>> +        tmp = RREG32(IH_RB_CNTL);
>>>>>>>>>>>> +        tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
>>>>>>>>>>>> +        WREG32(IH_RB_CNTL, tmp);
>>>>>>>>>>>> +    }
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>>    static int si_ih_early_init(void *handle)
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>>>> index 917707bba7f3..6f5090d3db48 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>>>>>>>>> @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct
>>>>>>>>>>>> amdgpu_device
>>>>>>>>>>>> *adev,
>>>>>>>>>>>>        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>>        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>>
>>>>>>>>>>>>    out:
>>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>> @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>    static void tonga_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>>                      struct amdgpu_ih_ring *ih)
>>>>>>>>>>>>    {
>>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>> +
>>>>>>>>>>>>        if (ih->use_doorbell) {
>>>>>>>>>>>>            /* XXX check if swapping is necessary on BE */
>>>>>>>>>>>>            *ih->rptr_cpu = ih->rptr;
>>>>>>>>>>>> @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>        } else {
>>>>>>>>>>>>            WREG32(mmIH_RB_RPTR, ih->rptr);
>>>>>>>>>>>>        }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>>> bit),
>>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>>> +        tmp = RREG32(mmIH_RB_CNTL);
>>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>>> 0);
>>>>>>>>>>>> +        WREG32(mmIH_RB_CNTL, tmp);
>>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>>> +    }
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>>    static int tonga_ih_early_init(void *handle)
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>>> index d364c6dd152c..bb005924f194 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>>> @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>>
>>>>>>>>>>>>    out:
>>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>> @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>    static void vega10_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>>>    {
>>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>>>
>>>>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>>>>> @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>>>        }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>>> bit),
>>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>>> 0);
>>>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>>> +    }
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>>    /**
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>>>> index ddfc6941f9d5..bb725a970697 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
>>>>>>>>>>>> @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
>>>>>>>>>>>>        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR, 1);
>>>>>>>>>>>>        WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp);
>>>>>>>>>>>> +    ih->overflow = true;
>>>>>>>>>>>>
>>>>>>>>>>>>    out:
>>>>>>>>>>>>        return (wptr & ih->ptr_mask);
>>>>>>>>>>>> @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>    static void vega20_ih_set_rptr(struct amdgpu_device *adev,
>>>>>>>>>>>>                       struct amdgpu_ih_ring *ih)
>>>>>>>>>>>>    {
>>>>>>>>>>>> +    u32 tmp;
>>>>>>>>>>>>        struct amdgpu_ih_regs *ih_regs;
>>>>>>>>>>>>
>>>>>>>>>>>>        if (ih == &adev->irq.ih_soft)
>>>>>>>>>>>> @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>            ih_regs = &ih->ih_regs;
>>>>>>>>>>>>            WREG32(ih_regs->ih_rb_rptr, ih->rptr);
>>>>>>>>>>>>        }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* If we overflowed previously (and thus set the
>>>>>>>>>>>> OVERFLOW_CLEAR
>>>>>>>>>>>> bit),
>>>>>>>>>>>> +     * reset it here to detect more overflows if they occur.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    if (ih->overflow) {
>>>>>>>>>>>> +        tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl);
>>>>>>>>>>>> +        tmp = REG_SET_FIELD(tmp, IH_RB_CNTL,
>>>>>>>>>>>> WPTR_OVERFLOW_CLEAR,
>>>>>>>>>>>> 0);
>>>>>>>>>>>> + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp);
>>>>>>>>>>>> +        ih->overflow = false;
>>>>>>>>>>>> +    }
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>>    /**
>>>>>>>>>>>> --
>>>>>>>>>>>> 2.43.0
>>>>>>>>>>>>
>>>>
>>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-22 10:45                       ` Friedrich Vock
@ 2024-01-22 13:35                         ` Christian König
  2024-01-22 22:39                           ` Joshua Ashton
  0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2024-01-22 13:35 UTC (permalink / raw)
  To: Friedrich Vock, Felix Kuehling, Christian König,
	Alex Deucher, Dommati, Sunil-kumar
  Cc: Alex Deucher, amd-gfx, Joshua Ashton

Am 22.01.24 um 11:45 schrieb Friedrich Vock:
> On 22.01.24 11:21, Friedrich Vock wrote:
>> On 22.01.24 11:10, Christian König wrote:
>>> Am 19.01.24 um 20:18 schrieb Felix Kuehling:
>>>> On 2024-01-18 07:07, Christian König wrote:
>>>>> Am 18.01.24 um 00:44 schrieb Friedrich Vock:
>>>>>> On 18.01.24 00:00, Alex Deucher wrote:
>>>>>> [SNIP]
>>>>>> No, amdgpu.noretry=1 does not change anything.
>>>>>
>>>>> Well the good news first the hw engineer answered rather quickly.
>>>>> The bad news is that the hardware really doesn't work as documented
>>>>> in multiple ways.
>>>>>
>>>>> First of all the CLEAR bit is a level and not a trigger, so the
>>>>> intention to clear it is indeed correct. For now please modify this
>>>>> patch so that the CLEAR bit is set and cleared directly after
>>>>> setting it, this way we should be able to detect further overflows
>>>>> immediately.
>>>>>
>>>>> Then the APU the Steam Deck uses simply doesn't have the filter
>>>>> function for page faults in the hardware, the really bad news is it
>>>>> also doesn't have the extra IH rings where we could re-route the
>>>>> faults to prevent overflows.
>>>>>
>>>>> That full explains the behavior you have been seeing, but doesn't
>>>>> really provide a doable solution to mitigate this problem.
>>>>>
>>>>> I'm going to dig deeper into the hw documentation and specification
>>>>> to see if we can use a different feature to avoid the overflow.
>>>>
>>>> If we're not enabling retry faults, then each wave front should
>>>> generate at most one fault. You should be able to avoid overflows by
>>>> making the IH ring large enough to accommodate one fault per wave
>>>> front.
>>>
>>> That is the exact same argument our HW engineers came up with when we
>>> asked why the APU is missing all those nice IH ring overflow avoidance
>>> features the dGPUs have :)
>>>
>> I can reproduce IH overflows on my RX 6700 XT dGPU as well FWIW.

Interesting data point. We have probably looked too much into the faults 
on MI* products and never checked Navi.

Can you try just setting WPTR_OVERFLOW_ENABLE to 0? At least in 
theory that should disable IH overflows altogether on Navi without 
causing loss of IVs.

>>
>>> The only problem with this approach is that on Navi when a wave is
>>> blocked by waiting on a fault you can't kill it using soft recovery
>>> any more (at least when my understanding is correct).
>>>
>> Killing page-faulted waves via soft recovery works. From my testing on
>> Deck, it seems to take a bit of time, but if you try for long enough
>> soft recovery eventually succeeds.

Ok, that is massively strange. We had tons of discussions about how 
shaders can't be interrupted while they wait for a fault on Navi.

Maybe killing them is still possible, need to double check that as well.

>
>
> On second thought, could it be that this is the critical flaw in the "at
> most one fault per wave" thinking?

Well, I completely agree with this. The rationale for leaving out the new IH 
features on APUs is rather weak.

>
> Most work submissions in practice submit more waves than the number of
> wave slots the GPU has.
> As far as I understand soft recovery, the only thing it does is kill all
> active waves. This frees up the CUs so more waves are launched, which
> can fault again, and that leads to potentially lots of faults for a
> single wave slot in the end.

Exactly that, but killing each wave takes a moment since we do that in a 
loop with a bit of delay in there.

So the interrupt handler should at least in theory have time to catch up.

Regards,
Christian.

>
> Regards,
> Friedrich

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-22 13:35                         ` Christian König
@ 2024-01-22 22:39                           ` Joshua Ashton
  2024-01-23  9:36                             ` Christian König
  0 siblings, 1 reply; 28+ messages in thread
From: Joshua Ashton @ 2024-01-22 22:39 UTC (permalink / raw)
  To: Christian König, Friedrich Vock, Felix Kuehling,
	Christian König, Alex Deucher, Dommati, Sunil-kumar
  Cc: Alex Deucher, amd-gfx



On 1/22/24 13:35, Christian König wrote:
> Am 22.01.24 um 11:45 schrieb Friedrich Vock:
>> On 22.01.24 11:21, Friedrich Vock wrote:
>>> On 22.01.24 11:10, Christian König wrote:
>>>> Am 19.01.24 um 20:18 schrieb Felix Kuehling:
>>>>> On 2024-01-18 07:07, Christian König wrote:
>>>>>> Am 18.01.24 um 00:44 schrieb Friedrich Vock:
>>>>>>> On 18.01.24 00:00, Alex Deucher wrote:
>>>>>>> [SNIP]
>>>>>>> No, amdgpu.noretry=1 does not change anything.
>>>>>>
>>>>>> Well the good news first the hw engineer answered rather quickly.
>>>>>> The bad news is that the hardware really doesn't work as documented
>>>>>> in multiple ways.
>>>>>>
>>>>>> First of all the CLEAR bit is a level and not a trigger, so the
>>>>>> intention to clear it is indeed correct. For now please modify this
>>>>>> patch so that the CLEAR bit is set and cleared directly after
>>>>>> setting it, this way we should be able to detect further overflows
>>>>>> immediately.
>>>>>>
>>>>>> Then the APU the Steam Deck uses simply doesn't have the filter
>>>>>> function for page faults in the hardware, the really bad news is it
>>>>>> also doesn't have the extra IH rings where we could re-route the
>>>>>> faults to prevent overflows.
>>>>>>
>>>>>> That full explains the behavior you have been seeing, but doesn't
>>>>>> really provide a doable solution to mitigate this problem.
>>>>>>
>>>>>> I'm going to dig deeper into the hw documentation and specification
>>>>>> to see if we can use a different feature to avoid the overflow.
>>>>>
>>>>> If we're not enabling retry faults, then each wave front should
>>>>> generate at most one fault. You should be able to avoid overflows by
>>>>> making the IH ring large enough to accommodate one fault per wave
>>>>> front.
>>>>
>>>> That is the exact same argument our HW engineers came up with when we
>>>> asked why the APU is missing all those nice IH ring overflow avoidance
>>>> features the dGPUs have :)
>>>>
>>> I can reproduce IH overflows on my RX 6700 XT dGPU as well FWIW.
> 
> Interesting data point. We have probably looked to much into the faults 
> on MI* products and never checked Navi.
> 
> Can you try to just setting WPTR_OVERFLOW_ENABLE to 0? At least in 
> theory that should disable IH overflows altogether on Navi without 
> causing loss of IVs.
> 
>>>
>>>> The only problem with this approach is that on Navi when a wave is
>>>> blocked by waiting on a fault you can't kill it using soft recovery
>>>> any more (at least when my understanding is correct).
>>>>
>>> Killing page-faulted waves via soft recovery works. From my testing on
>>> Deck, it seems to take a bit of time, but if you try for long enough
>>> soft recovery eventually succeeds.
> 
> Ok that is massively strange. We had tons of discussions about that 
> shader can't be interrupted while they wait for a fault on Navi.
> 
> Maybe killing them is still possible, need to double check that as well.
> 
>>
>>
>> On second thought, could it be that this is the critical flaw in the "at
>> most one fault per wave" thinking?
> 
> Well completely agree that this. That rational to leave out the new IH 
> features on APUs is rather weak.
> 
>>
>> Most work submissions in practice submit more waves than the number of
>> wave slots the GPU has.
>> As far as I understand soft recovery, the only thing it does is kill all
>> active waves. This frees up the CUs so more waves are launched, which
>> can fault again, and that leads to potentially lots of faults for a
>> single wave slot in the end.
> 
> Exactly that, but killing each wave takes a moment since we do that in a 
> loop with a bit delay in there.
> 
> So the interrupt handler should at least in theory have time to catch up.

I don't think there is any delay in that loop is there?

	while (!dma_fence_is_signaled(fence) &&
	       ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
		ring->funcs->soft_recovery(ring, vmid);

(soft_recovery function does not have a delay/sleep/whatever either)

FWIW, two other changes we made in SteamOS to make recovery more reliable 
on VANGOGH were:

1) Move the timeout determination after the spinlock setting the fence 
error.

2) Raise the timeout from 0.1s to 1s.

- Joshie 🐸✨


> 
> Regards,
> Christian.
> 
>>
>> Regards,
>> Friedrich


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-22 22:39                           ` Joshua Ashton
@ 2024-01-23  9:36                             ` Christian König
  2024-01-23 11:35                               ` Friedrich Vock
  0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2024-01-23  9:36 UTC (permalink / raw)
  To: Joshua Ashton, Friedrich Vock, Felix Kuehling,
	Christian König, Alex Deucher, Dommati, Sunil-kumar
  Cc: Alex Deucher, amd-gfx



Am 22.01.24 um 23:39 schrieb Joshua Ashton:
> [SNIP]
>>>
>>> Most work submissions in practice submit more waves than the number of
>>> wave slots the GPU has.
>>> As far as I understand soft recovery, the only thing it does is kill 
>>> all
>>> active waves. This frees up the CUs so more waves are launched, which
>>> can fault again, and that leads to potentially lots of faults for a
>>> single wave slot in the end.
>>
>> Exactly that, but killing each wave takes a moment since we do that 
>> in a loop with a bit delay in there.
>>
>> So the interrupt handler should at least in theory have time to catch 
>> up.
>
> I don't think there is any delay in that loop is there?

Mhm, looks like I remember that incorrectly.

>
>     while (!dma_fence_is_signaled(fence) &&
>            ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
>         ring->funcs->soft_recovery(ring, vmid);
>
> (soft_recovery function does not have a delay/sleep/whatever either)
>
> FWIW, two other changes we did in SteamOS to make recovery more 
> reliable on VANGOGH was:
>
> 1) Move the timeout determination after the spinlock setting the fence 
> error.

Well that should not really have any effect.

>
> 2) Raise the timeout from 0.1s to 1s.

Well that's not necessarily a good idea. If the SQ isn't able to respond 
in 100ms then I would really go into a hard reset.

Waiting one extra second is way too long here.

Regards,
Christian.

>
> - Joshie 🐸✨
>
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>> Friedrich
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-23  9:36                             ` Christian König
@ 2024-01-23 11:35                               ` Friedrich Vock
  2024-01-23 12:49                                 ` Christian König
  0 siblings, 1 reply; 28+ messages in thread
From: Friedrich Vock @ 2024-01-23 11:35 UTC (permalink / raw)
  To: Christian König, Joshua Ashton, Felix Kuehling,
	Christian König, Alex Deucher, Dommati, Sunil-kumar
  Cc: Alex Deucher, amd-gfx

On 23.01.24 10:36, Christian König wrote:
>
>
> Am 22.01.24 um 23:39 schrieb Joshua Ashton:
>> [SNIP]
>>>>
>>>> Most work submissions in practice submit more waves than the number of
>>>> wave slots the GPU has.
>>>> As far as I understand soft recovery, the only thing it does is
>>>> kill all
>>>> active waves. This frees up the CUs so more waves are launched, which
>>>> can fault again, and that leads to potentially lots of faults for a
>>>> single wave slot in the end.
>>>
>>> Exactly that, but killing each wave takes a moment since we do that
>>> in a loop with a bit delay in there.
>>>
>>> So the interrupt handler should at least in theory have time to
>>> catch up.
>>
>> I don't think there is any delay in that loop is there?
>
> Mhm, looks like I remember that incorrectly.
>
>>
>>     while (!dma_fence_is_signaled(fence) &&
>>            ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
>>         ring->funcs->soft_recovery(ring, vmid);
>>
>> (soft_recovery function does not have a delay/sleep/whatever either)
>>
>> FWIW, two other changes we did in SteamOS to make recovery more
>> reliable on VANGOGH was:
>>
>> 1) Move the timeout determination after the spinlock setting the
>> fence error.
>
> Well that should not really have any effect.
>
>>
>> 2) Raise the timeout from 0.1s to 1s.
>
> Well that's not necessarily a good idea. If the SQ isn't able to
> respond in 100ms then I would really go into a hard reset.
>
> Waiting one extra second is way to long here.

Bumping the timeout seemed to be necessary in order to reliably
soft-recover from hangs with page faults. (Being able to soft-recover
from these is actually a really good thing, because if e.g. games
accidentally trigger faults, it won't kill a user's entire system.)

However, the bump I had in mind was more moderate: Currently the timeout
is 10ms (=0.01s). Bumping that to 0.1s already improves reliability
enough. I agree that waiting a full second before giving up might be a
bit too long.

Regards,
Friedrich

>
> Regards,
> Christian.
>
>>
>> - Joshie 🐸✨
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Regards,
>>>> Friedrich
>>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-23 11:35                               ` Friedrich Vock
@ 2024-01-23 12:49                                 ` Christian König
  2024-02-02 11:11                                   ` Joshua Ashton
  0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2024-01-23 12:49 UTC (permalink / raw)
  To: Friedrich Vock, Joshua Ashton, Felix Kuehling,
	Christian König, Alex Deucher, Dommati, Sunil-kumar
  Cc: Alex Deucher, amd-gfx

Am 23.01.24 um 12:35 schrieb Friedrich Vock:
> On 23.01.24 10:36, Christian König wrote:
>>
>>
>> Am 22.01.24 um 23:39 schrieb Joshua Ashton:
>>> [SNIP]
>>>>>
>>>>> Most work submissions in practice submit more waves than the 
>>>>> number of
>>>>> wave slots the GPU has.
>>>>> As far as I understand soft recovery, the only thing it does is
>>>>> kill all
>>>>> active waves. This frees up the CUs so more waves are launched, which
>>>>> can fault again, and that leads to potentially lots of faults for a
>>>>> single wave slot in the end.
>>>>
>>>> Exactly that, but killing each wave takes a moment since we do that
>>>> in a loop with a bit delay in there.
>>>>
>>>> So the interrupt handler should at least in theory have time to
>>>> catch up.
>>>
>>> I don't think there is any delay in that loop is there?
>>
>> Mhm, looks like I remember that incorrectly.
>>
>>>
>>>     while (!dma_fence_is_signaled(fence) &&
>>>            ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
>>>         ring->funcs->soft_recovery(ring, vmid);
>>>
>>> (soft_recovery function does not have a delay/sleep/whatever either)
>>>
>>> FWIW, two other changes we did in SteamOS to make recovery more
>>> reliable on VANGOGH was:
>>>
>>> 1) Move the timeout determination after the spinlock setting the
>>> fence error.
>>
>> Well that should not really have any effect.
>>
>>>
>>> 2) Raise the timeout from 0.1s to 1s.
>>
>> Well that's not necessarily a good idea. If the SQ isn't able to
>> respond in 100ms then I would really go into a hard reset.
>>
>> Waiting one extra second is way to long here.
>
> Bumping the timeout seemed to be necessary in order to reliably
> soft-recover from hangs with page faults. (Being able to soft-recover
> from these is actually a really good thing, because if e.g. games
> accidentally trigger faults, it won't kill a user's entire system.)

I still have an extremely bad feeling about that. From the discussions, a 
wave which waits for a fault resolution can be neither preempted nor killed.

So what most likely happens is that some of the state sticks around in 
the hw and can only be cleared with a hard recovery.

For the steam deck it might still be the better option but that is most 
likely not the best solution for every use case. It could for example be 
that the system doesn't have the full performance any more.

>
> However, the bump I had in mind was more moderate: Currently the timeout
> is 10ms (=0.01s). Bumping that to 0.1s already improves reliability
> enough. I agree that waiting a full second before giving up might be a
> bit too long.

Well we should never have a timeout longer than we would expect a 
submission to be. So assuming a minimum of 10fps we should never go over 
100ms or so.

If killing the waves takes longer than the original submission would 
have then there is most likely some state not correctly cleared in the 
hw and we really have to do a hard reset to clean up.

Regards,
Christian.

>
> Regards,
> Friedrich
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> - Joshie 🐸✨
>>>
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>> Regards,
>>>>> Friedrich
>>>
>>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-01-23 12:49                                 ` Christian König
@ 2024-02-02 11:11                                   ` Joshua Ashton
  2024-02-02 13:31                                     ` Christian König
  0 siblings, 1 reply; 28+ messages in thread
From: Joshua Ashton @ 2024-02-02 11:11 UTC (permalink / raw)
  To: Christian König, Friedrich Vock, Felix Kuehling,
	Christian König, Alex Deucher, Dommati, Sunil-kumar
  Cc: Alex Deucher, amd-gfx

Hello Christian,

Any update on finding an upstreamable solution for this problem?

Having working hang recovery is really important for us on Steam Deck, 
and it would be nice to have an upstream solution, and not carry a bunch 
of patches you disagree with. :P

Thanks
- Joshie 🐸✨

On 1/23/24 12:49, Christian König wrote:
> Am 23.01.24 um 12:35 schrieb Friedrich Vock:
>> On 23.01.24 10:36, Christian König wrote:
>>>
>>>
>>> Am 22.01.24 um 23:39 schrieb Joshua Ashton:
>>>> [SNIP]
>>>>>>
>>>>>> Most work submissions in practice submit more waves than the 
>>>>>> number of
>>>>>> wave slots the GPU has.
>>>>>> As far as I understand soft recovery, the only thing it does is
>>>>>> kill all
>>>>>> active waves. This frees up the CUs so more waves are launched, which
>>>>>> can fault again, and that leads to potentially lots of faults for a
>>>>>> single wave slot in the end.
>>>>>
>>>>> Exactly that, but killing each wave takes a moment since we do that
>>>>> in a loop with a bit delay in there.
>>>>>
>>>>> So the interrupt handler should at least in theory have time to
>>>>> catch up.
>>>>
>>>> I don't think there is any delay in that loop is there?
>>>
>>> Mhm, looks like I remember that incorrectly.
>>>
>>>>
>>>>     while (!dma_fence_is_signaled(fence) &&
>>>>            ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
>>>>         ring->funcs->soft_recovery(ring, vmid);
>>>>
>>>> (soft_recovery function does not have a delay/sleep/whatever either)
>>>>
>>>> FWIW, two other changes we did in SteamOS to make recovery more
>>>> reliable on VANGOGH was:
>>>>
>>>> 1) Move the timeout determination after the spinlock setting the
>>>> fence error.
>>>
>>> Well that should not really have any effect.
>>>
>>>>
>>>> 2) Raise the timeout from 0.1s to 1s.
>>>
>>> Well that's not necessarily a good idea. If the SQ isn't able to
>>> respond in 100ms then I would really go into a hard reset.
>>>
>>> Waiting one extra second is way to long here.
>>
>> Bumping the timeout seemed to be necessary in order to reliably
>> soft-recover from hangs with page faults. (Being able to soft-recover
>> from these is actually a really good thing, because if e.g. games
>> accidentally trigger faults, it won't kill a user's entire system.)
> 
> I still have an extremely bad feeling about that. From the discussions a 
> wave which waits for a fault resolution can't be preempted nor killed.
> 
> So what most likely happens is that some of the state sticks around in 
> the hw and can only be cleared with a hard recovery.
> 
> For the steam deck it might still be the better option but that is most 
> likely not the best solution for every use case. It could for example be 
> that the system doesn't have the full performance any more.
> 
>>
>> However, the bump I had in mind was more moderate: Currently the timeout
>> is 10ms (=0.01s). Bumping that to 0.1s already improves reliability
>> enough. I agree that waiting a full second before giving up might be a
>> bit too long.
> 
> Well we should never have a timeout longer than we would expect a 
> submission to be. So assuming a minimum of 10fps we should never go over 
> 100ms or so.
> 
> If killing the waves takes longer than the original submission would 
> have then there is most likely some state not correctly cleared in the 
> hw and we really have to do a hard reset to clean up.
> 
> Regards,
> Christian.
> 
>>
>> Regards,
>> Friedrich
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> - Joshie 🐸✨
>>>>
>>>>
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>> Friedrich
>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr
  2024-02-02 11:11                                   ` Joshua Ashton
@ 2024-02-02 13:31                                     ` Christian König
  0 siblings, 0 replies; 28+ messages in thread
From: Christian König @ 2024-02-02 13:31 UTC (permalink / raw)
  To: Joshua Ashton, Friedrich Vock, Felix Kuehling,
	Christian König, Alex Deucher, Dommati, Sunil-kumar
  Cc: Alex Deucher, amd-gfx

Hi Joshie,

the first patch is already on the way upstream since that is a clear bug 
fix.

Sunil has set up a test system, contacted Friedrich to get his 
hands on the test application, and reproduced the problem. It looks like 
the OVERFLOW_CLEAR bit is only the tip of the iceberg of incorrect 
IH documentation, and we are now nuking the HW engineers responsible for 
this block with questions.

I will also be pushing for us to get an IGT test for this and to 
find a long-term solution so that we are not surprised by incorrect hw 
documentation any more.

Thanks,
Christian.

Am 02.02.24 um 12:11 schrieb Joshua Ashton:
> Hello Christian,
>
> Any update on finding an upstreamable solution for this problem?
>
> Having working hang recovery is really important for us on Steam Deck, 
> and it would be nice to have an upstream solution, and not carry a 
> bunch of patches you disagree with. :P
>
> Thanks
> - Joshie 🐸✨
>
> On 1/23/24 12:49, Christian König wrote:
>> Am 23.01.24 um 12:35 schrieb Friedrich Vock:
>>> On 23.01.24 10:36, Christian König wrote:
>>>>
>>>>
>>>> Am 22.01.24 um 23:39 schrieb Joshua Ashton:
>>>>> [SNIP]
>>>>>>>
>>>>>>> Most work submissions in practice submit more waves than the 
>>>>>>> number of
>>>>>>> wave slots the GPU has.
>>>>>>> As far as I understand soft recovery, the only thing it does is
>>>>>>> kill all
>>>>>>> active waves. This frees up the CUs so more waves are launched, 
>>>>>>> which
>>>>>>> can fault again, and that leads to potentially lots of faults for a
>>>>>>> single wave slot in the end.
>>>>>>
>>>>>> Exactly that, but killing each wave takes a moment since we do that
>>>>>> in a loop with a bit of delay in there.
>>>>>>
>>>>>> So the interrupt handler should at least in theory have time to
>>>>>> catch up.
>>>>>
>>>>> I don't think there is any delay in that loop, is there?
>>>>
>>>> Mhm, looks like I remember that incorrectly.
>>>>
>>>>>
>>>>>     while (!dma_fence_is_signaled(fence) &&
>>>>>            ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
>>>>>         ring->funcs->soft_recovery(ring, vmid);
>>>>>
>>>>> (soft_recovery function does not have a delay/sleep/whatever either)
>>>>>
>>>>> FWIW, two other changes we did in SteamOS to make recovery more
>>>>> reliable on VANGOGH were:
>>>>>
>>>>> 1) Move the timeout determination after the spinlock setting the
>>>>> fence error.
>>>>
>>>> Well that should not really have any effect.
>>>>
>>>>>
>>>>> 2) Raise the timeout from 0.1s to 1s.
>>>>
>>>> Well that's not necessarily a good idea. If the SQ isn't able to
>>>> respond in 100ms then I would really go into a hard reset.
>>>>
>>>> Waiting one extra second is way too long here.
>>>
>>> Bumping the timeout seemed to be necessary in order to reliably
>>> soft-recover from hangs with page faults. (Being able to soft-recover
>>> from these is actually a really good thing, because if e.g. games
>>> accidentally trigger faults, it won't kill a user's entire system.)
>>
>> I still have an extremely bad feeling about that. From the 
>> discussions a wave which waits for a fault resolution can't be 
>> preempted nor killed.
>>
>> So what most likely happens is that some of the state sticks around 
>> in the hw and can only be cleared with a hard recovery.
>>
>> For the steam deck it might still be the better option but that is 
>> most likely not the best solution for every use case. It could for 
>> example be that the system doesn't have the full performance any more.
>>
>>>
>>> However, the bump I had in mind was more moderate: Currently the 
>>> timeout
>>> is 10ms (=0.01s). Bumping that to 0.1s already improves reliability
>>> enough. I agree that waiting a full second before giving up might be a
>>> bit too long.
>>
>> Well we should never have a timeout longer than we would expect a 
>> submission to be. So assuming a minimum of 10fps we should never go 
>> over 100ms or so.
>>
>> If killing the waves takes longer than the original submission would 
>> have then there is most likely some state not correctly cleared in 
>> the hw and we really have to do a hard reset to clean up.
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>> Friedrich
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>> - Joshie 🐸✨
>>>>>
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>> Regards,
>>>>>>> Friedrich
>>>>>
>>>>
>>


^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2024-02-02 13:43 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2024-01-14 13:00 [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr Friedrich Vock
2024-01-14 13:00 ` Friedrich Vock
2024-01-14 13:00 ` [PATCH 2/2] drm/amdgpu: Process fences on IH overflow Friedrich Vock
2024-01-14 13:00   ` Friedrich Vock
2024-01-15 10:26   ` Christian König
2024-01-15 10:26     ` Christian König
2024-01-15 11:19     ` Friedrich Vock
2024-01-15 11:19       ` Friedrich Vock
2024-01-16  7:17       ` Christian König
2024-01-16  7:17         ` Christian König
     [not found] ` <69cec077-4011-4738-bbb0-8fb1e6f52159@gmail.com>
2024-01-15 11:18   ` [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr Friedrich Vock
2024-01-16  7:03     ` Christian König
2024-01-16 10:31       ` Friedrich Vock
2024-01-17 12:27         ` Christian König
2024-01-17 23:00           ` Alex Deucher
2024-01-17 23:44             ` Friedrich Vock
2024-01-18 12:07               ` Christian König
2024-01-19 19:18                 ` Felix Kuehling
2024-01-22 10:10                   ` Christian König
2024-01-22 10:21                     ` Friedrich Vock
2024-01-22 10:45                       ` Friedrich Vock
2024-01-22 13:35                         ` Christian König
2024-01-22 22:39                           ` Joshua Ashton
2024-01-23  9:36                             ` Christian König
2024-01-23 11:35                               ` Friedrich Vock
2024-01-23 12:49                                 ` Christian König
2024-02-02 11:11                                   ` Joshua Ashton
2024-02-02 13:31                                     ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.