* [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
@ 2024-07-19 9:16 Jack Xiao
2024-07-19 13:56 ` Alex Deucher
2024-07-19 15:44 ` Christian König
0 siblings, 2 replies; 14+ messages in thread
From: Jack Xiao @ 2024-07-19 9:16 UTC (permalink / raw)
To: amd-gfx, Alexander.Deucher; +Cc: Jack Xiao
wait memory room until enough before writing mes packets
to avoid ring buffer overflow.
Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
2 files changed, 28 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 8ce51b9236c1..68c74adf79f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
const char *op_str, *misc_op_str;
unsigned long flags;
u64 status_gpu_addr;
- u32 status_offset;
+ u32 seq, status_offset;
u64 *status_ptr;
signed long r;
int ret;
@@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
if (r)
goto error_unlock_free;
+ seq = ++ring->fence_drv.sync_seq;
+ r = amdgpu_fence_wait_polling(ring,
+ seq - ring->fence_drv.num_fences_mask,
+ timeout);
+ if (r < 1)
+ goto error_undo;
+
api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
api_status->api_completion_fence_addr = status_gpu_addr;
api_status->api_completion_fence_value = 1;
@@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
mes_status_pkt.api_status.api_completion_fence_addr =
ring->fence_drv.gpu_addr;
- mes_status_pkt.api_status.api_completion_fence_value =
- ++ring->fence_drv.sync_seq;
+ mes_status_pkt.api_status.api_completion_fence_value = seq;
amdgpu_ring_write_multiple(ring, &mes_status_pkt,
sizeof(mes_status_pkt) / 4);
@@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
dev_dbg(adev->dev, "MES msg=%d was emitted\n",
x_pkt->header.opcode);
- r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
+ r = amdgpu_fence_wait_polling(ring, seq, timeout);
if (r < 1 || !*status_ptr) {
if (misc_op_str)
@@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
amdgpu_device_wb_free(adev, status_offset);
return 0;
+error_undo:
+ dev_err(adev->dev, "MES ring buffer is full.\n");
+ amdgpu_ring_undo(ring);
+
error_unlock_free:
spin_unlock_irqrestore(&mes->ring_lock, flags);
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index c9f74231ad59..48e01206bcc4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
const char *op_str, *misc_op_str;
unsigned long flags;
u64 status_gpu_addr;
- u32 status_offset;
+ u32 seq, status_offset;
u64 *status_ptr;
signed long r;
int ret;
@@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
if (r)
goto error_unlock_free;
+ seq = ++ring->fence_drv.sync_seq;
+ r = amdgpu_fence_wait_polling(ring,
+ seq - ring->fence_drv.num_fences_mask,
+ timeout);
+ if (r < 1)
+ goto error_undo;
+
api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
api_status->api_completion_fence_addr = status_gpu_addr;
api_status->api_completion_fence_value = 1;
@@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
mes_status_pkt.api_status.api_completion_fence_addr =
ring->fence_drv.gpu_addr;
- mes_status_pkt.api_status.api_completion_fence_value =
- ++ring->fence_drv.sync_seq;
+ mes_status_pkt.api_status.api_completion_fence_value = seq;
amdgpu_ring_write_multiple(ring, &mes_status_pkt,
sizeof(mes_status_pkt) / 4);
@@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
dev_dbg(adev->dev, "MES msg=%d was emitted\n",
x_pkt->header.opcode);
- r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
+ r = amdgpu_fence_wait_polling(ring, seq, timeout);
if (r < 1 || !*status_ptr) {
if (misc_op_str)
@@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
amdgpu_device_wb_free(adev, status_offset);
return 0;
+error_undo:
+ dev_err(adev->dev, "MES ring buffer is full.\n");
+ amdgpu_ring_undo(ring);
+
error_unlock_free:
spin_unlock_irqrestore(&mes->ring_lock, flags);
--
2.41.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-07-19 9:16 Jack Xiao
@ 2024-07-19 13:56 ` Alex Deucher
2024-07-19 15:44 ` Christian König
1 sibling, 0 replies; 14+ messages in thread
From: Alex Deucher @ 2024-07-19 13:56 UTC (permalink / raw)
To: Jack Xiao; +Cc: amd-gfx, Alexander.Deucher
On Fri, Jul 19, 2024 at 5:35 AM Jack Xiao <Jack.Xiao@amd.com> wrote:
>
> wait memory room until enough before writing mes packets
> to avoid ring buffer overflow.
>
> Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")
Acked-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> 2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 8ce51b9236c1..68c74adf79f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> const char *op_str, *misc_op_str;
> unsigned long flags;
> u64 status_gpu_addr;
> - u32 status_offset;
> + u32 seq, status_offset;
> u64 *status_ptr;
> signed long r;
> int ret;
> @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> if (r)
> goto error_unlock_free;
>
> + seq = ++ring->fence_drv.sync_seq;
> + r = amdgpu_fence_wait_polling(ring,
> + seq - ring->fence_drv.num_fences_mask,
> + timeout);
> + if (r < 1)
> + goto error_undo;
> +
> api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> api_status->api_completion_fence_addr = status_gpu_addr;
> api_status->api_completion_fence_value = 1;
> @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> mes_status_pkt.api_status.api_completion_fence_addr =
> ring->fence_drv.gpu_addr;
> - mes_status_pkt.api_status.api_completion_fence_value =
> - ++ring->fence_drv.sync_seq;
> + mes_status_pkt.api_status.api_completion_fence_value = seq;
>
> amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> sizeof(mes_status_pkt) / 4);
> @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> x_pkt->header.opcode);
>
> - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> if (r < 1 || !*status_ptr) {
>
> if (misc_op_str)
> @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> amdgpu_device_wb_free(adev, status_offset);
> return 0;
>
> +error_undo:
> + dev_err(adev->dev, "MES ring buffer is full.\n");
> + amdgpu_ring_undo(ring);
> +
> error_unlock_free:
> spin_unlock_irqrestore(&mes->ring_lock, flags);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index c9f74231ad59..48e01206bcc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> const char *op_str, *misc_op_str;
> unsigned long flags;
> u64 status_gpu_addr;
> - u32 status_offset;
> + u32 seq, status_offset;
> u64 *status_ptr;
> signed long r;
> int ret;
> @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> if (r)
> goto error_unlock_free;
>
> + seq = ++ring->fence_drv.sync_seq;
> + r = amdgpu_fence_wait_polling(ring,
> + seq - ring->fence_drv.num_fences_mask,
> + timeout);
> + if (r < 1)
> + goto error_undo;
> +
> api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> api_status->api_completion_fence_addr = status_gpu_addr;
> api_status->api_completion_fence_value = 1;
> @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> mes_status_pkt.api_status.api_completion_fence_addr =
> ring->fence_drv.gpu_addr;
> - mes_status_pkt.api_status.api_completion_fence_value =
> - ++ring->fence_drv.sync_seq;
> + mes_status_pkt.api_status.api_completion_fence_value = seq;
>
> amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> sizeof(mes_status_pkt) / 4);
> @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> x_pkt->header.opcode);
>
> - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> if (r < 1 || !*status_ptr) {
>
> if (misc_op_str)
> @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> amdgpu_device_wb_free(adev, status_offset);
> return 0;
>
> +error_undo:
> + dev_err(adev->dev, "MES ring buffer is full.\n");
> + amdgpu_ring_undo(ring);
> +
> error_unlock_free:
> spin_unlock_irqrestore(&mes->ring_lock, flags);
>
> --
> 2.41.0
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-07-19 9:16 Jack Xiao
2024-07-19 13:56 ` Alex Deucher
@ 2024-07-19 15:44 ` Christian König
2024-07-22 3:27 ` Xiao, Jack
1 sibling, 1 reply; 14+ messages in thread
From: Christian König @ 2024-07-19 15:44 UTC (permalink / raw)
To: Jack Xiao, amd-gfx, Alexander.Deucher
Am 19.07.24 um 11:16 schrieb Jack Xiao:
> wait memory room until enough before writing mes packets
> to avoid ring buffer overflow.
>
> Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> 2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 8ce51b9236c1..68c74adf79f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> const char *op_str, *misc_op_str;
> unsigned long flags;
> u64 status_gpu_addr;
> - u32 status_offset;
> + u32 seq, status_offset;
> u64 *status_ptr;
> signed long r;
> int ret;
> @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> if (r)
> goto error_unlock_free;
>
> + seq = ++ring->fence_drv.sync_seq;
> + r = amdgpu_fence_wait_polling(ring,
> + seq - ring->fence_drv.num_fences_mask,
> + timeout);
> + if (r < 1)
> + goto error_undo;
> +
> api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> api_status->api_completion_fence_addr = status_gpu_addr;
> api_status->api_completion_fence_value = 1;
> @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> mes_status_pkt.api_status.api_completion_fence_addr =
> ring->fence_drv.gpu_addr;
> - mes_status_pkt.api_status.api_completion_fence_value =
> - ++ring->fence_drv.sync_seq;
> + mes_status_pkt.api_status.api_completion_fence_value = seq;
>
> amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> sizeof(mes_status_pkt) / 4);
> @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> x_pkt->header.opcode);
>
> - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> if (r < 1 || !*status_ptr) {
>
> if (misc_op_str)
> @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> amdgpu_device_wb_free(adev, status_offset);
> return 0;
>
> +error_undo:
> + dev_err(adev->dev, "MES ring buffer is full.\n");
> + amdgpu_ring_undo(ring);
> +
> error_unlock_free:
> spin_unlock_irqrestore(&mes->ring_lock, flags);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index c9f74231ad59..48e01206bcc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> const char *op_str, *misc_op_str;
> unsigned long flags;
> u64 status_gpu_addr;
> - u32 status_offset;
> + u32 seq, status_offset;
> u64 *status_ptr;
> signed long r;
> int ret;
> @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> if (r)
> goto error_unlock_free;
>
> + seq = ++ring->fence_drv.sync_seq;
> + r = amdgpu_fence_wait_polling(ring,
> + seq - ring->fence_drv.num_fences_mask,
That's what's amdgpu_fence_emit_polling() does anyway.
So this here just moves the polling a bit earlier.
I think we rather need to increase the MES ring size instead.
Regards,
Christian.
> + timeout);
> + if (r < 1)
> + goto error_undo;
> +
> api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> api_status->api_completion_fence_addr = status_gpu_addr;
> api_status->api_completion_fence_value = 1;
> @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> mes_status_pkt.api_status.api_completion_fence_addr =
> ring->fence_drv.gpu_addr;
> - mes_status_pkt.api_status.api_completion_fence_value =
> - ++ring->fence_drv.sync_seq;
> + mes_status_pkt.api_status.api_completion_fence_value = seq;
>
> amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> sizeof(mes_status_pkt) / 4);
> @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> x_pkt->header.opcode);
>
> - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> if (r < 1 || !*status_ptr) {
>
> if (misc_op_str)
> @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> amdgpu_device_wb_free(adev, status_offset);
> return 0;
>
> +error_undo:
> + dev_err(adev->dev, "MES ring buffer is full.\n");
> + amdgpu_ring_undo(ring);
> +
> error_unlock_free:
> spin_unlock_irqrestore(&mes->ring_lock, flags);
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-07-19 15:44 ` Christian König
@ 2024-07-22 3:27 ` Xiao, Jack
2024-07-22 8:20 ` Christian König
0 siblings, 1 reply; 14+ messages in thread
From: Xiao, Jack @ 2024-07-22 3:27 UTC (permalink / raw)
To: Christian König, amd-gfx@lists.freedesktop.org,
Deucher, Alexander
[-- Attachment #1: Type: text/plain, Size: 6352 bytes --]
[AMD Official Use Only - AMD Internal Distribution Only]
>> I think we rather need to increase the MES ring size instead.
Unfortunately, it doesn't work. I guess mes firmware has limitation.
Regards,
Jack
________________________________
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Friday, 19 July 2024 23:44
To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
Am 19.07.24 um 11:16 schrieb Jack Xiao:
> wait memory room until enough before writing mes packets
> to avoid ring buffer overflow.
>
> Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> 2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 8ce51b9236c1..68c74adf79f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> const char *op_str, *misc_op_str;
> unsigned long flags;
> u64 status_gpu_addr;
> - u32 status_offset;
> + u32 seq, status_offset;
> u64 *status_ptr;
> signed long r;
> int ret;
> @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> if (r)
> goto error_unlock_free;
>
> + seq = ++ring->fence_drv.sync_seq;
> + r = amdgpu_fence_wait_polling(ring,
> + seq - ring->fence_drv.num_fences_mask,
> + timeout);
> + if (r < 1)
> + goto error_undo;
> +
> api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> api_status->api_completion_fence_addr = status_gpu_addr;
> api_status->api_completion_fence_value = 1;
> @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> mes_status_pkt.api_status.api_completion_fence_addr =
> ring->fence_drv.gpu_addr;
> - mes_status_pkt.api_status.api_completion_fence_value =
> - ++ring->fence_drv.sync_seq;
> + mes_status_pkt.api_status.api_completion_fence_value = seq;
>
> amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> sizeof(mes_status_pkt) / 4);
> @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> x_pkt->header.opcode);
>
> - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> if (r < 1 || !*status_ptr) {
>
> if (misc_op_str)
> @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> amdgpu_device_wb_free(adev, status_offset);
> return 0;
>
> +error_undo:
> + dev_err(adev->dev, "MES ring buffer is full.\n");
> + amdgpu_ring_undo(ring);
> +
> error_unlock_free:
> spin_unlock_irqrestore(&mes->ring_lock, flags);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index c9f74231ad59..48e01206bcc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> const char *op_str, *misc_op_str;
> unsigned long flags;
> u64 status_gpu_addr;
> - u32 status_offset;
> + u32 seq, status_offset;
> u64 *status_ptr;
> signed long r;
> int ret;
> @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> if (r)
> goto error_unlock_free;
>
> + seq = ++ring->fence_drv.sync_seq;
> + r = amdgpu_fence_wait_polling(ring,
> + seq - ring->fence_drv.num_fences_mask,
That's what's amdgpu_fence_emit_polling() does anyway.
So this here just moves the polling a bit earlier.
I think we rather need to increase the MES ring size instead.
Regards,
Christian.
> + timeout);
> + if (r < 1)
> + goto error_undo;
> +
> api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> api_status->api_completion_fence_addr = status_gpu_addr;
> api_status->api_completion_fence_value = 1;
> @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> mes_status_pkt.api_status.api_completion_fence_addr =
> ring->fence_drv.gpu_addr;
> - mes_status_pkt.api_status.api_completion_fence_value =
> - ++ring->fence_drv.sync_seq;
> + mes_status_pkt.api_status.api_completion_fence_value = seq;
>
> amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> sizeof(mes_status_pkt) / 4);
> @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> x_pkt->header.opcode);
>
> - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> if (r < 1 || !*status_ptr) {
>
> if (misc_op_str)
> @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> amdgpu_device_wb_free(adev, status_offset);
> return 0;
>
> +error_undo:
> + dev_err(adev->dev, "MES ring buffer is full.\n");
> + amdgpu_ring_undo(ring);
> +
> error_unlock_free:
> spin_unlock_irqrestore(&mes->ring_lock, flags);
>
[-- Attachment #2: Type: text/html, Size: 13370 bytes --]
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-07-22 3:27 ` Xiao, Jack
@ 2024-07-22 8:20 ` Christian König
2024-07-22 8:46 ` Xiao, Jack
0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2024-07-22 8:20 UTC (permalink / raw)
To: Xiao, Jack, amd-gfx@lists.freedesktop.org, Deucher, Alexander
[-- Attachment #1: Type: text/plain, Size: 7348 bytes --]
Thx, but in that case this patch here then won't help either it just
mitigates the problem.
Can you try to reduce num_hw_submission for the MES ring?
Thanks,
Christian.
Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> />> I think we rather need to increase the MES ring size instead./
>
> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>
> Regards,
> Jack
>
> ------------------------------------------------------------------------
> *From:* Christian König <ckoenig.leichtzumerken@gmail.com>
> *Sent:* Friday, 19 July 2024 23:44
> *To:* Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org
> <amd-gfx@lists.freedesktop.org>; Deucher, Alexander
> <Alexander.Deucher@amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
> Am 19.07.24 um 11:16 schrieb Jack Xiao:
> > wait memory room until enough before writing mes packets
> > to avoid ring buffer overflow.
> >
> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> > drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> > 2 files changed, 28 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index 8ce51b9236c1..68c74adf79f1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -168,7 +168,7 @@ static int
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > const char *op_str, *misc_op_str;
> > unsigned long flags;
> > u64 status_gpu_addr;
> > - u32 status_offset;
> > + u32 seq, status_offset;
> > u64 *status_ptr;
> > signed long r;
> > int ret;
> > @@ -196,6 +196,13 @@ static int
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > if (r)
> > goto error_unlock_free;
> >
> > + seq = ++ring->fence_drv.sync_seq;
> > + r = amdgpu_fence_wait_polling(ring,
> > + seq -
> ring->fence_drv.num_fences_mask,
> > + timeout);
> > + if (r < 1)
> > + goto error_undo;
> > +
> > api_status = (struct MES_API_STATUS *)((char *)pkt +
> api_status_off);
> > api_status->api_completion_fence_addr = status_gpu_addr;
> > api_status->api_completion_fence_value = 1;
> > @@ -208,8 +215,7 @@ static int
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> > ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > - ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -229,7 +235,7 @@ static int
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> > x_pkt->header.opcode);
> >
> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
> timeout);
> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> > if (r < 1 || !*status_ptr) {
> >
> > if (misc_op_str)
> > @@ -252,6 +258,10 @@ static int
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > amdgpu_device_wb_free(adev, status_offset);
> > return 0;
> >
> > +error_undo:
> > + dev_err(adev->dev, "MES ring buffer is full.\n");
> > + amdgpu_ring_undo(ring);
> > +
> > error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > index c9f74231ad59..48e01206bcc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > @@ -154,7 +154,7 @@ static int
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > const char *op_str, *misc_op_str;
> > unsigned long flags;
> > u64 status_gpu_addr;
> > - u32 status_offset;
> > + u32 seq, status_offset;
> > u64 *status_ptr;
> > signed long r;
> > int ret;
> > @@ -182,6 +182,13 @@ static int
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > if (r)
> > goto error_unlock_free;
> >
> > + seq = ++ring->fence_drv.sync_seq;
> > + r = amdgpu_fence_wait_polling(ring,
> > + seq -
> ring->fence_drv.num_fences_mask,
>
> That's what's amdgpu_fence_emit_polling() does anyway.
>
> So this here just moves the polling a bit earlier.
>
> I think we rather need to increase the MES ring size instead.
>
> Regards,
> Christian.
>
>
> > + timeout);
> > + if (r < 1)
> > + goto error_undo;
> > +
> > api_status = (struct MES_API_STATUS *)((char *)pkt +
> api_status_off);
> > api_status->api_completion_fence_addr = status_gpu_addr;
> > api_status->api_completion_fence_value = 1;
> > @@ -194,8 +201,7 @@ static int
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> > ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > - ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -215,7 +221,7 @@ static int
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> > x_pkt->header.opcode);
> >
> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
> timeout);
> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> > if (r < 1 || !*status_ptr) {
> >
> > if (misc_op_str)
> > @@ -238,6 +244,10 @@ static int
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > amdgpu_device_wb_free(adev, status_offset);
> > return 0;
> >
> > +error_undo:
> > + dev_err(adev->dev, "MES ring buffer is full.\n");
> > + amdgpu_ring_undo(ring);
> > +
> > error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
>
[-- Attachment #2: Type: text/html, Size: 14561 bytes --]
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-07-22 8:20 ` Christian König
@ 2024-07-22 8:46 ` Xiao, Jack
2024-07-22 11:20 ` Christian König
0 siblings, 1 reply; 14+ messages in thread
From: Xiao, Jack @ 2024-07-22 8:46 UTC (permalink / raw)
To: Christian König, amd-gfx@lists.freedesktop.org,
Deucher, Alexander
[-- Attachment #1: Type: text/plain, Size: 7461 bytes --]
[AMD Official Use Only - AMD Internal Distribution Only]
>> Can you try to reduce num_hw_submission for the MES ring?
Smaller num_hw_submission should not help for this issue, for Mes work without drm scheduler like legacy kiq. Smaller num_hw_submission will result in smaller mes ring size and more waiting time.
Regards,
Jack
________________________________
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Monday, 22 July 2024 16:20
To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
Thx, but in that case this patch here then won't help either it just mitigates the problem.
Can you try to reduce num_hw_submission for the MES ring?
Thanks,
Christian.
Am 22.07.24 um 05:27 schrieb Xiao, Jack:
[AMD Official Use Only - AMD Internal Distribution Only]
>> I think we rather need to increase the MES ring size instead.
Unfortunately, it doesn't work. I guess mes firmware has limitation.
Regards,
Jack
________________________________
From: Christian König <ckoenig.leichtzumerken@gmail.com><mailto:ckoenig.leichtzumerken@gmail.com>
Sent: Friday, 19 July 2024 23:44
To: Xiao, Jack <Jack.Xiao@amd.com><mailto:Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org><mailto:amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
Am 19.07.24 um 11:16 schrieb Jack Xiao:
> wait memory room until enough before writing mes packets
> to avoid ring buffer overflow.
>
> Signed-off-by: Jack Xiao <Jack.Xiao@amd.com><mailto:Jack.Xiao@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> 2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 8ce51b9236c1..68c74adf79f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> const char *op_str, *misc_op_str;
> unsigned long flags;
> u64 status_gpu_addr;
> - u32 status_offset;
> + u32 seq, status_offset;
> u64 *status_ptr;
> signed long r;
> int ret;
> @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> if (r)
> goto error_unlock_free;
>
> + seq = ++ring->fence_drv.sync_seq;
> + r = amdgpu_fence_wait_polling(ring,
> + seq - ring->fence_drv.num_fences_mask,
> + timeout);
> + if (r < 1)
> + goto error_undo;
> +
> api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> api_status->api_completion_fence_addr = status_gpu_addr;
> api_status->api_completion_fence_value = 1;
> @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> mes_status_pkt.api_status.api_completion_fence_addr =
> ring->fence_drv.gpu_addr;
> - mes_status_pkt.api_status.api_completion_fence_value =
> - ++ring->fence_drv.sync_seq;
> + mes_status_pkt.api_status.api_completion_fence_value = seq;
>
> amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> sizeof(mes_status_pkt) / 4);
> @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> x_pkt->header.opcode);
>
> - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> if (r < 1 || !*status_ptr) {
>
> if (misc_op_str)
> @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> amdgpu_device_wb_free(adev, status_offset);
> return 0;
>
> +error_undo:
> + dev_err(adev->dev, "MES ring buffer is full.\n");
> + amdgpu_ring_undo(ring);
> +
> error_unlock_free:
> spin_unlock_irqrestore(&mes->ring_lock, flags);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index c9f74231ad59..48e01206bcc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> const char *op_str, *misc_op_str;
> unsigned long flags;
> u64 status_gpu_addr;
> - u32 status_offset;
> + u32 seq, status_offset;
> u64 *status_ptr;
> signed long r;
> int ret;
> @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> if (r)
> goto error_unlock_free;
>
> + seq = ++ring->fence_drv.sync_seq;
> + r = amdgpu_fence_wait_polling(ring,
> + seq - ring->fence_drv.num_fences_mask,
That's what's amdgpu_fence_emit_polling() does anyway.
So this here just moves the polling a bit earlier.
I think we rather need to increase the MES ring size instead.
Regards,
Christian.
> + timeout);
> + if (r < 1)
> + goto error_undo;
> +
> api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> api_status->api_completion_fence_addr = status_gpu_addr;
> api_status->api_completion_fence_value = 1;
> @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> mes_status_pkt.api_status.api_completion_fence_addr =
> ring->fence_drv.gpu_addr;
> - mes_status_pkt.api_status.api_completion_fence_value =
> - ++ring->fence_drv.sync_seq;
> + mes_status_pkt.api_status.api_completion_fence_value = seq;
>
> amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> sizeof(mes_status_pkt) / 4);
> @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> x_pkt->header.opcode);
>
> - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> if (r < 1 || !*status_ptr) {
>
> if (misc_op_str)
> @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> amdgpu_device_wb_free(adev, status_offset);
> return 0;
>
> +error_undo:
> + dev_err(adev->dev, "MES ring buffer is full.\n");
> + amdgpu_ring_undo(ring);
> +
> error_unlock_free:
> spin_unlock_irqrestore(&mes->ring_lock, flags);
>
[-- Attachment #2: Type: text/html, Size: 16249 bytes --]
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-07-22 8:46 ` Xiao, Jack
@ 2024-07-22 11:20 ` Christian König
2024-07-22 19:52 ` Alex Deucher
0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2024-07-22 11:20 UTC (permalink / raw)
To: Xiao, Jack, amd-gfx@lists.freedesktop.org, Deucher, Alexander
[-- Attachment #1: Type: text/plain, Size: 8983 bytes --]
What I meant is that the MES ring is now to small for the number of
packets written to it.
Either reduce the num_hw_submission or increase the MES ring size.
E.g. see this code here in amdgpu_ring_init:
if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
sched_hw_submission = max(sched_hw_submission, 256);
else if (ring == &adev->sdma.instance[0].page)
sched_hw_submission = 256;
We are basically just missing a case for the MES here as far as I can see.
Regards,
Christian.
Am 22.07.24 um 10:46 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> />> Can you try to reduce num_hw_submission for the MES ring?
> /
> Smaller num_hw_submission should not help for this issue, for Mes work
> without drm scheduler like legacy kiq. Smaller num_hw_submission will
> result in smaller mes ring size and more waiting time.
>
> Regards,
> Jack
> ------------------------------------------------------------------------
> *From:* Christian König <ckoenig.leichtzumerken@gmail.com>
> *Sent:* Monday, 22 July 2024 16:20
> *To:* Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org
> <amd-gfx@lists.freedesktop.org>; Deucher, Alexander
> <Alexander.Deucher@amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
> Thx, but in that case this patch here then won't help either it just
> mitigates the problem.
>
> Can you try to reduce num_hw_submission for the MES ring?
>
> Thanks,
> Christian.
>
> Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>>
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>>
>> />> I think we rather need to increase the MES ring size instead./
>>
>> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>>
>> Regards,
>> Jack
>>
>> ------------------------------------------------------------------------
>> *From:* Christian König <ckoenig.leichtzumerken@gmail.com>
>> <mailto:ckoenig.leichtzumerken@gmail.com>
>> *Sent:* Friday, 19 July 2024 23:44
>> *To:* Xiao, Jack <Jack.Xiao@amd.com> <mailto:Jack.Xiao@amd.com>;
>> amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org>
>> <amd-gfx@lists.freedesktop.org>
>> <mailto:amd-gfx@lists.freedesktop.org>; Deucher, Alexander
>> <Alexander.Deucher@amd.com> <mailto:Alexander.Deucher@amd.com>
>> *Subject:* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>> Am 19.07.24 um 11:16 schrieb Jack Xiao:
>> > wait memory room until enough before writing mes packets
>> > to avoid ring buffer overflow.
>> >
>> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com> <mailto:Jack.Xiao@amd.com>
>> > ---
>> > drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
>> > drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
>> > 2 files changed, 28 insertions(+), 8 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> > index 8ce51b9236c1..68c74adf79f1 100644
>> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> > @@ -168,7 +168,7 @@ static int
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > const char *op_str, *misc_op_str;
>> > unsigned long flags;
>> > u64 status_gpu_addr;
>> > - u32 status_offset;
>> > + u32 seq, status_offset;
>> > u64 *status_ptr;
>> > signed long r;
>> > int ret;
>> > @@ -196,6 +196,13 @@ static int
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > if (r)
>> > goto error_unlock_free;
>> >
>> > + seq = ++ring->fence_drv.sync_seq;
>> > + r = amdgpu_fence_wait_polling(ring,
>> > + seq -
>> ring->fence_drv.num_fences_mask,
>> > + timeout);
>> > + if (r < 1)
>> > + goto error_undo;
>> > +
>> > api_status = (struct MES_API_STATUS *)((char *)pkt +
>> api_status_off);
>> > api_status->api_completion_fence_addr = status_gpu_addr;
>> > api_status->api_completion_fence_value = 1;
>> > @@ -208,8 +215,7 @@ static int
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>> > mes_status_pkt.api_status.api_completion_fence_addr =
>> > ring->fence_drv.gpu_addr;
>> > - mes_status_pkt.api_status.api_completion_fence_value =
>> > - ++ring->fence_drv.sync_seq;
>> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
>> >
>> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>> > sizeof(mes_status_pkt) / 4);
>> > @@ -229,7 +235,7 @@ static int
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>> > x_pkt->header.opcode);
>> >
>> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
>> timeout);
>> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
>> > if (r < 1 || !*status_ptr) {
>> >
>> > if (misc_op_str)
>> > @@ -252,6 +258,10 @@ static int
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > amdgpu_device_wb_free(adev, status_offset);
>> > return 0;
>> >
>> > +error_undo:
>> > + dev_err(adev->dev, "MES ring buffer is full.\n");
>> > + amdgpu_ring_undo(ring);
>> > +
>> > error_unlock_free:
>> > spin_unlock_irqrestore(&mes->ring_lock, flags);
>> >
>> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> > index c9f74231ad59..48e01206bcc4 100644
>> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> > @@ -154,7 +154,7 @@ static int
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > const char *op_str, *misc_op_str;
>> > unsigned long flags;
>> > u64 status_gpu_addr;
>> > - u32 status_offset;
>> > + u32 seq, status_offset;
>> > u64 *status_ptr;
>> > signed long r;
>> > int ret;
>> > @@ -182,6 +182,13 @@ static int
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > if (r)
>> > goto error_unlock_free;
>> >
>> > + seq = ++ring->fence_drv.sync_seq;
>> > + r = amdgpu_fence_wait_polling(ring,
>> > + seq -
>> ring->fence_drv.num_fences_mask,
>>
>> That's what's amdgpu_fence_emit_polling() does anyway.
>>
>> So this here just moves the polling a bit earlier.
>>
>> I think we rather need to increase the MES ring size instead.
>>
>> Regards,
>> Christian.
>>
>>
>> > + timeout);
>> > + if (r < 1)
>> > + goto error_undo;
>> > +
>> > api_status = (struct MES_API_STATUS *)((char *)pkt +
>> api_status_off);
>> > api_status->api_completion_fence_addr = status_gpu_addr;
>> > api_status->api_completion_fence_value = 1;
>> > @@ -194,8 +201,7 @@ static int
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>> > mes_status_pkt.api_status.api_completion_fence_addr =
>> > ring->fence_drv.gpu_addr;
>> > - mes_status_pkt.api_status.api_completion_fence_value =
>> > - ++ring->fence_drv.sync_seq;
>> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
>> >
>> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>> > sizeof(mes_status_pkt) / 4);
>> > @@ -215,7 +221,7 @@ static int
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>> > x_pkt->header.opcode);
>> >
>> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
>> timeout);
>> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
>> > if (r < 1 || !*status_ptr) {
>> >
>> > if (misc_op_str)
>> > @@ -238,6 +244,10 @@ static int
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > amdgpu_device_wb_free(adev, status_offset);
>> > return 0;
>> >
>> > +error_undo:
>> > + dev_err(adev->dev, "MES ring buffer is full.\n");
>> > + amdgpu_ring_undo(ring);
>> > +
>> > error_unlock_free:
>> > spin_unlock_irqrestore(&mes->ring_lock, flags);
>> >
>>
>
[-- Attachment #2: Type: text/html, Size: 20657 bytes --]
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-07-22 11:20 ` Christian König
@ 2024-07-22 19:52 ` Alex Deucher
2024-07-23 3:08 ` Xiao, Jack
0 siblings, 1 reply; 14+ messages in thread
From: Alex Deucher @ 2024-07-22 19:52 UTC (permalink / raw)
To: Christian König
Cc: Xiao, Jack, amd-gfx@lists.freedesktop.org, Deucher, Alexander
Does this patch fix it?
https://patchwork.freedesktop.org/patch/605437/
Alex
On Mon, Jul 22, 2024 at 7:21 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> What I meant is that the MES ring is now to small for the number of packets written to it.
>
> Either reduce the num_hw_submission or increase the MES ring size.
>
> E.g. see this code here in amdgpu_ring_init:
>
> if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
> sched_hw_submission = max(sched_hw_submission, 256);
> else if (ring == &adev->sdma.instance[0].page)
> sched_hw_submission = 256;
>
> We are basically just missing a case for the MES here as far as I can see.
>
> Regards,
> Christian.
>
> Am 22.07.24 um 10:46 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> Can you try to reduce num_hw_submission for the MES ring?
>
> Smaller num_hw_submission should not help for this issue, for Mes work without drm scheduler like legacy kiq. Smaller num_hw_submission will result in smaller mes ring size and more waiting time.
>
> Regards,
> Jack
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, 22 July 2024 16:20
> To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Thx, but in that case this patch here then won't help either it just mitigates the problem.
>
> Can you try to reduce num_hw_submission for the MES ring?
>
> Thanks,
> Christian.
>
> Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> I think we rather need to increase the MES ring size instead.
>
> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>
> Regards,
> Jack
>
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Friday, 19 July 2024 23:44
> To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Am 19.07.24 um 11:16 schrieb Jack Xiao:
> > wait memory room until enough before writing mes packets
> > to avoid ring buffer overflow.
> >
> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> > drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> > 2 files changed, 28 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index 8ce51b9236c1..68c74adf79f1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > const char *op_str, *misc_op_str;
> > unsigned long flags;
> > u64 status_gpu_addr;
> > - u32 status_offset;
> > + u32 seq, status_offset;
> > u64 *status_ptr;
> > signed long r;
> > int ret;
> > @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > if (r)
> > goto error_unlock_free;
> >
> > + seq = ++ring->fence_drv.sync_seq;
> > + r = amdgpu_fence_wait_polling(ring,
> > + seq - ring->fence_drv.num_fences_mask,
> > + timeout);
> > + if (r < 1)
> > + goto error_undo;
> > +
> > api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> > api_status->api_completion_fence_addr = status_gpu_addr;
> > api_status->api_completion_fence_value = 1;
> > @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> > ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > - ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> > x_pkt->header.opcode);
> >
> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> > if (r < 1 || !*status_ptr) {
> >
> > if (misc_op_str)
> > @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > amdgpu_device_wb_free(adev, status_offset);
> > return 0;
> >
> > +error_undo:
> > + dev_err(adev->dev, "MES ring buffer is full.\n");
> > + amdgpu_ring_undo(ring);
> > +
> > error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > index c9f74231ad59..48e01206bcc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > const char *op_str, *misc_op_str;
> > unsigned long flags;
> > u64 status_gpu_addr;
> > - u32 status_offset;
> > + u32 seq, status_offset;
> > u64 *status_ptr;
> > signed long r;
> > int ret;
> > @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > if (r)
> > goto error_unlock_free;
> >
> > + seq = ++ring->fence_drv.sync_seq;
> > + r = amdgpu_fence_wait_polling(ring,
> > + seq - ring->fence_drv.num_fences_mask,
>
> That's what's amdgpu_fence_emit_polling() does anyway.
>
> So this here just moves the polling a bit earlier.
>
> I think we rather need to increase the MES ring size instead.
>
> Regards,
> Christian.
>
>
> > + timeout);
> > + if (r < 1)
> > + goto error_undo;
> > +
> > api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> > api_status->api_completion_fence_addr = status_gpu_addr;
> > api_status->api_completion_fence_value = 1;
> > @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> > ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > - ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> > x_pkt->header.opcode);
> >
> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> > if (r < 1 || !*status_ptr) {
> >
> > if (misc_op_str)
> > @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > amdgpu_device_wb_free(adev, status_offset);
> > return 0;
> >
> > +error_undo:
> > + dev_err(adev->dev, "MES ring buffer is full.\n");
> > + amdgpu_ring_undo(ring);
> > +
> > error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
>
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-07-22 19:52 ` Alex Deucher
@ 2024-07-23 3:08 ` Xiao, Jack
2024-07-23 8:15 ` Xiao, Jack
0 siblings, 1 reply; 14+ messages in thread
From: Xiao, Jack @ 2024-07-23 3:08 UTC (permalink / raw)
To: Alex Deucher, Christian König
Cc: amd-gfx@lists.freedesktop.org, Deucher, Alexander
[-- Attachment #1: Type: text/plain, Size: 9009 bytes --]
[AMD Official Use Only - AMD Internal Distribution Only]
> Does this patch fix it?
https://patchwork.freedesktop.org/patch/605437/
No, please do not check in the patch, it will make my fix not working.
Regards,
Jack
________________________________
From: Alex Deucher <alexdeucher@gmail.com>
Sent: Tuesday, 23 July 2024 03:52
To: Christian König <ckoenig.leichtzumerken@gmail.com>
Cc: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
Does this patch fix it?
https://patchwork.freedesktop.org/patch/605437/
Alex
On Mon, Jul 22, 2024 at 7:21 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> What I meant is that the MES ring is now to small for the number of packets written to it.
>
> Either reduce the num_hw_submission or increase the MES ring size.
>
> E.g. see this code here in amdgpu_ring_init:
>
> if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
> sched_hw_submission = max(sched_hw_submission, 256);
> else if (ring == &adev->sdma.instance[0].page)
> sched_hw_submission = 256;
>
> We are basically just missing a case for the MES here as far as I can see.
>
> Regards,
> Christian.
>
> Am 22.07.24 um 10:46 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> Can you try to reduce num_hw_submission for the MES ring?
>
> Smaller num_hw_submission should not help for this issue, for Mes work without drm scheduler like legacy kiq. Smaller num_hw_submission will result in smaller mes ring size and more waiting time.
>
> Regards,
> Jack
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, 22 July 2024 16:20
> To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Thx, but in that case this patch here then won't help either it just mitigates the problem.
>
> Can you try to reduce num_hw_submission for the MES ring?
>
> Thanks,
> Christian.
>
> Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> I think we rather need to increase the MES ring size instead.
>
> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>
> Regards,
> Jack
>
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Friday, 19 July 2024 23:44
> To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Am 19.07.24 um 11:16 schrieb Jack Xiao:
> > wait memory room until enough before writing mes packets
> > to avoid ring buffer overflow.
> >
> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> > drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> > 2 files changed, 28 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index 8ce51b9236c1..68c74adf79f1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > const char *op_str, *misc_op_str;
> > unsigned long flags;
> > u64 status_gpu_addr;
> > - u32 status_offset;
> > + u32 seq, status_offset;
> > u64 *status_ptr;
> > signed long r;
> > int ret;
> > @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > if (r)
> > goto error_unlock_free;
> >
> > + seq = ++ring->fence_drv.sync_seq;
> > + r = amdgpu_fence_wait_polling(ring,
> > + seq - ring->fence_drv.num_fences_mask,
> > + timeout);
> > + if (r < 1)
> > + goto error_undo;
> > +
> > api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> > api_status->api_completion_fence_addr = status_gpu_addr;
> > api_status->api_completion_fence_value = 1;
> > @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> > ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > - ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> > x_pkt->header.opcode);
> >
> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> > if (r < 1 || !*status_ptr) {
> >
> > if (misc_op_str)
> > @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > amdgpu_device_wb_free(adev, status_offset);
> > return 0;
> >
> > +error_undo:
> > + dev_err(adev->dev, "MES ring buffer is full.\n");
> > + amdgpu_ring_undo(ring);
> > +
> > error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > index c9f74231ad59..48e01206bcc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > const char *op_str, *misc_op_str;
> > unsigned long flags;
> > u64 status_gpu_addr;
> > - u32 status_offset;
> > + u32 seq, status_offset;
> > u64 *status_ptr;
> > signed long r;
> > int ret;
> > @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > if (r)
> > goto error_unlock_free;
> >
> > + seq = ++ring->fence_drv.sync_seq;
> > + r = amdgpu_fence_wait_polling(ring,
> > + seq - ring->fence_drv.num_fences_mask,
>
> That's what's amdgpu_fence_emit_polling() does anyway.
>
> So this here just moves the polling a bit earlier.
>
> I think we rather need to increase the MES ring size instead.
>
> Regards,
> Christian.
>
>
> > + timeout);
> > + if (r < 1)
> > + goto error_undo;
> > +
> > api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> > api_status->api_completion_fence_addr = status_gpu_addr;
> > api_status->api_completion_fence_value = 1;
> > @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> > ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > - ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> > x_pkt->header.opcode);
> >
> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> > if (r < 1 || !*status_ptr) {
> >
> > if (misc_op_str)
> > @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > amdgpu_device_wb_free(adev, status_offset);
> > return 0;
> >
> > +error_undo:
> > + dev_err(adev->dev, "MES ring buffer is full.\n");
> > + amdgpu_ring_undo(ring);
> > +
> > error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
>
>
>
[-- Attachment #2: Type: text/html, Size: 17378 bytes --]
^ permalink raw reply [flat|nested] 14+ messages in thread
* RE: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-07-23 3:08 ` Xiao, Jack
@ 2024-07-23 8:15 ` Xiao, Jack
0 siblings, 0 replies; 14+ messages in thread
From: Xiao, Jack @ 2024-07-23 8:15 UTC (permalink / raw)
To: Xiao, Jack, Alex Deucher, Christian König
Cc: amd-gfx@lists.freedesktop.org, Deucher, Alexander
[-- Attachment #1: Type: text/plain, Size: 10396 bytes --]
[AMD Official Use Only - AMD Internal Distribution Only]
There is a max command number for mes queue in mes_api_def.h
enum {API_NUMBER_OF_COMMAND_MAX = 32};
It should explain why the patch can’t fix the issue.
I will send out another patch to refine code according to the firmware limitation.
Regards,
Jack
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Xiao, Jack
Sent: Tuesday, July 23, 2024 11:08 AM
To: Alex Deucher <alexdeucher@gmail.com>; Christian König <ckoenig.leichtzumerken@gmail.com>
Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
[AMD Official Use Only - AMD Internal Distribution Only]
[AMD Official Use Only - AMD Internal Distribution Only]
> Does this patch fix it?
https://patchwork.freedesktop.org/patch/605437/
No, please do not check in the patch, it will make my fix not working.
Regards,
Jack
________________________________
From: Alex Deucher <alexdeucher@gmail.com<mailto:alexdeucher@gmail.com>>
Sent: Tuesday, 23 July 2024 03:52
To: Christian König <ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>>
Cc: Xiao, Jack <Jack.Xiao@amd.com<mailto:Jack.Xiao@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>; Deucher, Alexander <Alexander.Deucher@amd.com<mailto:Alexander.Deucher@amd.com>>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
Does this patch fix it?
https://patchwork.freedesktop.org/patch/605437/
Alex
On Mon, Jul 22, 2024 at 7:21 AM Christian König
<ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>> wrote:
>
> What I meant is that the MES ring is now to small for the number of packets written to it.
>
> Either reduce the num_hw_submission or increase the MES ring size.
>
> E.g. see this code here in amdgpu_ring_init:
>
> if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
> sched_hw_submission = max(sched_hw_submission, 256);
> else if (ring == &adev->sdma.instance[0].page)
> sched_hw_submission = 256;
>
> We are basically just missing a case for the MES here as far as I can see.
>
> Regards,
> Christian.
>
> Am 22.07.24 um 10:46 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> Can you try to reduce num_hw_submission for the MES ring?
>
> Smaller num_hw_submission should not help for this issue, for Mes work without drm scheduler like legacy kiq. Smaller num_hw_submission will result in smaller mes ring size and more waiting time.
>
> Regards,
> Jack
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>>
> Sent: Monday, 22 July 2024 16:20
> To: Xiao, Jack <Jack.Xiao@amd.com<mailto:Jack.Xiao@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>; Deucher, Alexander <Alexander.Deucher@amd.com<mailto:Alexander.Deucher@amd.com>>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Thx, but in that case this patch here then won't help either it just mitigates the problem.
>
> Can you try to reduce num_hw_submission for the MES ring?
>
> Thanks,
> Christian.
>
> Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> I think we rather need to increase the MES ring size instead.
>
> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>
> Regards,
> Jack
>
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>>
> Sent: Friday, 19 July 2024 23:44
> To: Xiao, Jack <Jack.Xiao@amd.com<mailto:Jack.Xiao@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>; Deucher, Alexander <Alexander.Deucher@amd.com<mailto:Alexander.Deucher@amd.com>>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Am 19.07.24 um 11:16 schrieb Jack Xiao:
> > wait memory room until enough before writing mes packets
> > to avoid ring buffer overflow.
> >
> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com<mailto:Jack.Xiao@amd.com>>
> > ---
> > drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> > drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> > 2 files changed, 28 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index 8ce51b9236c1..68c74adf79f1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > const char *op_str, *misc_op_str;
> > unsigned long flags;
> > u64 status_gpu_addr;
> > - u32 status_offset;
> > + u32 seq, status_offset;
> > u64 *status_ptr;
> > signed long r;
> > int ret;
> > @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > if (r)
> > goto error_unlock_free;
> >
> > + seq = ++ring->fence_drv.sync_seq;
> > + r = amdgpu_fence_wait_polling(ring,
> > + seq - ring->fence_drv.num_fences_mask,
> > + timeout);
> > + if (r < 1)
> > + goto error_undo;
> > +
> > api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> > api_status->api_completion_fence_addr = status_gpu_addr;
> > api_status->api_completion_fence_value = 1;
> > @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> > ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > - ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> > x_pkt->header.opcode);
> >
> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> > if (r < 1 || !*status_ptr) {
> >
> > if (misc_op_str)
> > @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > amdgpu_device_wb_free(adev, status_offset);
> > return 0;
> >
> > +error_undo:
> > + dev_err(adev->dev, "MES ring buffer is full.\n");
> > + amdgpu_ring_undo(ring);
> > +
> > error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > index c9f74231ad59..48e01206bcc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > const char *op_str, *misc_op_str;
> > unsigned long flags;
> > u64 status_gpu_addr;
> > - u32 status_offset;
> > + u32 seq, status_offset;
> > u64 *status_ptr;
> > signed long r;
> > int ret;
> > @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > if (r)
> > goto error_unlock_free;
> >
> > + seq = ++ring->fence_drv.sync_seq;
> > + r = amdgpu_fence_wait_polling(ring,
> > + seq - ring->fence_drv.num_fences_mask,
>
> That's what's amdgpu_fence_emit_polling() does anyway.
>
> So this here just moves the polling a bit earlier.
>
> I think we rather need to increase the MES ring size instead.
>
> Regards,
> Christian.
>
>
> > + timeout);
> > + if (r < 1)
> > + goto error_undo;
> > +
> > api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> > api_status->api_completion_fence_addr = status_gpu_addr;
> > api_status->api_completion_fence_value = 1;
> > @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> > ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > - ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> > x_pkt->header.opcode);
> >
> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
> > if (r < 1 || !*status_ptr) {
> >
> > if (misc_op_str)
> > @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> > amdgpu_device_wb_free(adev, status_offset);
> > return 0;
> >
> > +error_undo:
> > + dev_err(adev->dev, "MES ring buffer is full.\n");
> > + amdgpu_ring_undo(ring);
> > +
> > error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
>
>
>
[-- Attachment #2: Type: text/html, Size: 21852 bytes --]
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
@ 2024-08-27 14:10 Alex Deucher
2024-08-27 14:21 ` Greg KH
0 siblings, 1 reply; 14+ messages in thread
From: Alex Deucher @ 2024-08-27 14:10 UTC (permalink / raw)
To: stable, gregkh, sashal; +Cc: Jack Xiao, Alex Deucher
From: Jack Xiao <Jack.Xiao@amd.com>
wait memory room until enough before writing mes packets
to avoid ring buffer overflow.
v2: squash in sched_hw_submission fix
Backport from 6.11.
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3571
Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")
Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 34e087e8920e635c62e2ed6a758b0cd27f836d13)
Cc: stable@vger.kernel.org # 6.10.x
(cherry picked from commit 11752c013f562a1124088a35bd314aa0e9f0e88f)
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 ++
drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 06f0a6534a94..88ffb15e25cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -212,6 +212,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
*/
if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
sched_hw_submission = max(sched_hw_submission, 256);
+ if (ring->funcs->type == AMDGPU_RING_TYPE_MES)
+ sched_hw_submission = 8;
else if (ring == &adev->sdma.instance[0].page)
sched_hw_submission = 256;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 32d4519541c6..e1a66d585f5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -163,7 +163,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
const char *op_str, *misc_op_str;
unsigned long flags;
u64 status_gpu_addr;
- u32 status_offset;
+ u32 seq, status_offset;
u64 *status_ptr;
signed long r;
int ret;
@@ -191,6 +191,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
if (r)
goto error_unlock_free;
+ seq = ++ring->fence_drv.sync_seq;
+ r = amdgpu_fence_wait_polling(ring,
+ seq - ring->fence_drv.num_fences_mask,
+ timeout);
+ if (r < 1)
+ goto error_undo;
+
api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
api_status->api_completion_fence_addr = status_gpu_addr;
api_status->api_completion_fence_value = 1;
@@ -203,8 +210,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
mes_status_pkt.api_status.api_completion_fence_addr =
ring->fence_drv.gpu_addr;
- mes_status_pkt.api_status.api_completion_fence_value =
- ++ring->fence_drv.sync_seq;
+ mes_status_pkt.api_status.api_completion_fence_value = seq;
amdgpu_ring_write_multiple(ring, &mes_status_pkt,
sizeof(mes_status_pkt) / 4);
@@ -224,7 +230,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
dev_dbg(adev->dev, "MES msg=%d was emitted\n",
x_pkt->header.opcode);
- r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
+ r = amdgpu_fence_wait_polling(ring, seq, timeout);
if (r < 1 || !*status_ptr) {
if (misc_op_str)
@@ -247,6 +253,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
amdgpu_device_wb_free(adev, status_offset);
return 0;
+error_undo:
+ dev_err(adev->dev, "MES ring buffer is full.\n");
+ amdgpu_ring_undo(ring);
+
error_unlock_free:
spin_unlock_irqrestore(&mes->ring_lock, flags);
--
2.46.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-08-27 14:10 [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow Alex Deucher
@ 2024-08-27 14:21 ` Greg KH
2024-08-27 15:01 ` Deucher, Alexander
0 siblings, 1 reply; 14+ messages in thread
From: Greg KH @ 2024-08-27 14:21 UTC (permalink / raw)
To: Alex Deucher; +Cc: stable, sashal, Jack Xiao
On Tue, Aug 27, 2024 at 10:10:25AM -0400, Alex Deucher wrote:
> From: Jack Xiao <Jack.Xiao@amd.com>
>
> wait memory room until enough before writing mes packets
> to avoid ring buffer overflow.
>
> v2: squash in sched_hw_submission fix
>
> Backport from 6.11.
>
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3571
> Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
> Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")
These commits are in 6.11-rc1.
> Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> Acked-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> (cherry picked from commit 34e087e8920e635c62e2ed6a758b0cd27f836d13)
> Cc: stable@vger.kernel.org # 6.10.x
So why does this need to go to 6.10.y?
confused,
greg k-h
^ permalink raw reply [flat|nested] 14+ messages in thread
* RE: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-08-27 14:21 ` Greg KH
@ 2024-08-27 15:01 ` Deucher, Alexander
2024-08-27 16:13 ` Greg KH
0 siblings, 1 reply; 14+ messages in thread
From: Deucher, Alexander @ 2024-08-27 15:01 UTC (permalink / raw)
To: Greg KH; +Cc: stable@vger.kernel.org, sashal@kernel.org, Xiao, Jack
[Public]
> -----Original Message-----
> From: Greg KH <gregkh@linuxfoundation.org>
> Sent: Tuesday, August 27, 2024 10:21 AM
> To: Deucher, Alexander <Alexander.Deucher@amd.com>
> Cc: stable@vger.kernel.org; sashal@kernel.org; Xiao, Jack
> <Jack.Xiao@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> On Tue, Aug 27, 2024 at 10:10:25AM -0400, Alex Deucher wrote:
> > From: Jack Xiao <Jack.Xiao@amd.com>
> >
> > wait memory room until enough before writing mes packets to avoid ring
> > buffer overflow.
> >
> > v2: squash in sched_hw_submission fix
> >
> > Backport from 6.11.
> >
> > Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3571
> > Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command
> submission")
> > Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")
>
> These commits are in 6.11-rc1.
de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
was ported to 6.10 as well:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c?h=linux-6.10.y&id=e356d321d0240663a09b139fa3658ddbca163e27
So this fix is applicable there.
Alex
>
> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> > Acked-by: Alex Deucher <alexander.deucher@amd.com>
> > Signed-off-by: Alex Deucher <alexander.deucher@amd.com> (cherry picked
> > from commit 34e087e8920e635c62e2ed6a758b0cd27f836d13)
> > Cc: stable@vger.kernel.org # 6.10.x
>
> So why does this need to go to 6.10.y?
>
> confused,
>
> greg k-h
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
2024-08-27 15:01 ` Deucher, Alexander
@ 2024-08-27 16:13 ` Greg KH
0 siblings, 0 replies; 14+ messages in thread
From: Greg KH @ 2024-08-27 16:13 UTC (permalink / raw)
To: Deucher, Alexander; +Cc: stable@vger.kernel.org, sashal@kernel.org, Xiao, Jack
On Tue, Aug 27, 2024 at 03:01:54PM +0000, Deucher, Alexander wrote:
> [Public]
>
> > -----Original Message-----
> > From: Greg KH <gregkh@linuxfoundation.org>
> > Sent: Tuesday, August 27, 2024 10:21 AM
> > To: Deucher, Alexander <Alexander.Deucher@amd.com>
> > Cc: stable@vger.kernel.org; sashal@kernel.org; Xiao, Jack
> > <Jack.Xiao@amd.com>
> > Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
> >
> > On Tue, Aug 27, 2024 at 10:10:25AM -0400, Alex Deucher wrote:
> > > From: Jack Xiao <Jack.Xiao@amd.com>
> > >
> > > wait memory room until enough before writing mes packets to avoid ring
> > > buffer overflow.
> > >
> > > v2: squash in sched_hw_submission fix
> > >
> > > Backport from 6.11.
> > >
> > > Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3571
> > > Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command
> > submission")
> > > Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")
> >
> > These commits are in 6.11-rc1.
>
> de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
> was ported to 6.10 as well:
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c?h=linux-6.10.y&id=e356d321d0240663a09b139fa3658ddbca163e27
> So this fix is applicable there.
No, commit e356d321d024 ("drm/amdgpu: cleanup MES11 command submission")
is in the 6.10 release, but commit de3246254156 ("drm/amdgpu: cleanup
MES11 command submission") is in 6.11-rc1!
So how in the world are we supposed to know anything here?
See how broken this all is?
I give up.
If you all want any AMD patches applied to stable trees, manually send
us a set of backported patches, AND be sure to get the git ids right.
I'll leave what I have right now in the queues, but after this round of
-rc releases, all AMD patches with cc: stable are going to be
automatically dropped and ignored. I NEED you all to manually send them
to me now as this is just insane.
Time to go buy a Intel gpu card as there's no way this is going to work
out well over time...
{sigh}
greg k-h
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2024-08-27 16:13 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-27 14:10 [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow Alex Deucher
2024-08-27 14:21 ` Greg KH
2024-08-27 15:01 ` Deucher, Alexander
2024-08-27 16:13 ` Greg KH
-- strict thread matches above, loose matches on Subject: below --
2024-07-19 9:16 Jack Xiao
2024-07-19 13:56 ` Alex Deucher
2024-07-19 15:44 ` Christian König
2024-07-22 3:27 ` Xiao, Jack
2024-07-22 8:20 ` Christian König
2024-07-22 8:46 ` Xiao, Jack
2024-07-22 11:20 ` Christian König
2024-07-22 19:52 ` Alex Deucher
2024-07-23 3:08 ` Xiao, Jack
2024-07-23 8:15 ` Xiao, Jack
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.