All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
@ 2024-07-19  9:16 Jack Xiao
  2024-07-19 13:56 ` Alex Deucher
  2024-07-19 15:44 ` Christian König
  0 siblings, 2 replies; 14+ messages in thread
From: Jack Xiao @ 2024-07-19  9:16 UTC (permalink / raw)
  To: amd-gfx, Alexander.Deucher; +Cc: Jack Xiao

wait memory room until enough before writing mes packets
to avoid ring buffer overflow.

Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 8ce51b9236c1..68c74adf79f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	const char *op_str, *misc_op_str;
 	unsigned long flags;
 	u64 status_gpu_addr;
-	u32 status_offset;
+	u32 seq, status_offset;
 	u64 *status_ptr;
 	signed long r;
 	int ret;
@@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	if (r)
 		goto error_unlock_free;
 
+	seq = ++ring->fence_drv.sync_seq;
+	r = amdgpu_fence_wait_polling(ring,
+				      seq - ring->fence_drv.num_fences_mask,
+				      timeout);
+	if (r < 1)
+		goto error_undo;
+
 	api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
 	api_status->api_completion_fence_addr = status_gpu_addr;
 	api_status->api_completion_fence_value = 1;
@@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
 	mes_status_pkt.api_status.api_completion_fence_addr =
 		ring->fence_drv.gpu_addr;
-	mes_status_pkt.api_status.api_completion_fence_value =
-		++ring->fence_drv.sync_seq;
+	mes_status_pkt.api_status.api_completion_fence_value = seq;
 
 	amdgpu_ring_write_multiple(ring, &mes_status_pkt,
 				   sizeof(mes_status_pkt) / 4);
@@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 		dev_dbg(adev->dev, "MES msg=%d was emitted\n",
 			x_pkt->header.opcode);
 
-	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
+	r = amdgpu_fence_wait_polling(ring, seq, timeout);
 	if (r < 1 || !*status_ptr) {
 
 		if (misc_op_str)
@@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	amdgpu_device_wb_free(adev, status_offset);
 	return 0;
 
+error_undo:
+	dev_err(adev->dev, "MES ring buffer is full.\n");
+	amdgpu_ring_undo(ring);
+
 error_unlock_free:
 	spin_unlock_irqrestore(&mes->ring_lock, flags);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index c9f74231ad59..48e01206bcc4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	const char *op_str, *misc_op_str;
 	unsigned long flags;
 	u64 status_gpu_addr;
-	u32 status_offset;
+	u32 seq, status_offset;
 	u64 *status_ptr;
 	signed long r;
 	int ret;
@@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	if (r)
 		goto error_unlock_free;
 
+	seq = ++ring->fence_drv.sync_seq;
+	r = amdgpu_fence_wait_polling(ring,
+				      seq - ring->fence_drv.num_fences_mask,
+				      timeout);
+	if (r < 1)
+		goto error_undo;
+
 	api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
 	api_status->api_completion_fence_addr = status_gpu_addr;
 	api_status->api_completion_fence_value = 1;
@@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
 	mes_status_pkt.api_status.api_completion_fence_addr =
 		ring->fence_drv.gpu_addr;
-	mes_status_pkt.api_status.api_completion_fence_value =
-		++ring->fence_drv.sync_seq;
+	mes_status_pkt.api_status.api_completion_fence_value = seq;
 
 	amdgpu_ring_write_multiple(ring, &mes_status_pkt,
 				   sizeof(mes_status_pkt) / 4);
@@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 		dev_dbg(adev->dev, "MES msg=%d was emitted\n",
 			x_pkt->header.opcode);
 
-	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
+	r = amdgpu_fence_wait_polling(ring, seq, timeout);
 	if (r < 1 || !*status_ptr) {
 
 		if (misc_op_str)
@@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	amdgpu_device_wb_free(adev, status_offset);
 	return 0;
 
+error_undo:
+	dev_err(adev->dev, "MES ring buffer is full.\n");
+	amdgpu_ring_undo(ring);
+
 error_unlock_free:
 	spin_unlock_irqrestore(&mes->ring_lock, flags);
 
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-07-19  9:16 Jack Xiao
@ 2024-07-19 13:56 ` Alex Deucher
  2024-07-19 15:44 ` Christian König
  1 sibling, 0 replies; 14+ messages in thread
From: Alex Deucher @ 2024-07-19 13:56 UTC (permalink / raw)
  To: Jack Xiao; +Cc: amd-gfx, Alexander.Deucher

On Fri, Jul 19, 2024 at 5:35 AM Jack Xiao <Jack.Xiao@amd.com> wrote:
>
> wait memory room until enough before writing mes packets
> to avoid ring buffer overflow.
>
> Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>

Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")

Acked-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
>  drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
>  2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 8ce51b9236c1..68c74adf79f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>         const char *op_str, *misc_op_str;
>         unsigned long flags;
>         u64 status_gpu_addr;
> -       u32 status_offset;
> +       u32 seq, status_offset;
>         u64 *status_ptr;
>         signed long r;
>         int ret;
> @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>         if (r)
>                 goto error_unlock_free;
>
> +       seq = ++ring->fence_drv.sync_seq;
> +       r = amdgpu_fence_wait_polling(ring,
> +                                     seq - ring->fence_drv.num_fences_mask,
> +                                     timeout);
> +       if (r < 1)
> +               goto error_undo;
> +
>         api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
>         api_status->api_completion_fence_addr = status_gpu_addr;
>         api_status->api_completion_fence_value = 1;
> @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>         mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>         mes_status_pkt.api_status.api_completion_fence_addr =
>                 ring->fence_drv.gpu_addr;
> -       mes_status_pkt.api_status.api_completion_fence_value =
> -               ++ring->fence_drv.sync_seq;
> +       mes_status_pkt.api_status.api_completion_fence_value = seq;
>
>         amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>                                    sizeof(mes_status_pkt) / 4);
> @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>                 dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>                         x_pkt->header.opcode);
>
> -       r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> +       r = amdgpu_fence_wait_polling(ring, seq, timeout);
>         if (r < 1 || !*status_ptr) {
>
>                 if (misc_op_str)
> @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>         amdgpu_device_wb_free(adev, status_offset);
>         return 0;
>
> +error_undo:
> +       dev_err(adev->dev, "MES ring buffer is full.\n");
> +       amdgpu_ring_undo(ring);
> +
>  error_unlock_free:
>         spin_unlock_irqrestore(&mes->ring_lock, flags);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index c9f74231ad59..48e01206bcc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>         const char *op_str, *misc_op_str;
>         unsigned long flags;
>         u64 status_gpu_addr;
> -       u32 status_offset;
> +       u32 seq, status_offset;
>         u64 *status_ptr;
>         signed long r;
>         int ret;
> @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>         if (r)
>                 goto error_unlock_free;
>
> +       seq = ++ring->fence_drv.sync_seq;
> +       r = amdgpu_fence_wait_polling(ring,
> +                                     seq - ring->fence_drv.num_fences_mask,
> +                                     timeout);
> +       if (r < 1)
> +               goto error_undo;
> +
>         api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
>         api_status->api_completion_fence_addr = status_gpu_addr;
>         api_status->api_completion_fence_value = 1;
> @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>         mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>         mes_status_pkt.api_status.api_completion_fence_addr =
>                 ring->fence_drv.gpu_addr;
> -       mes_status_pkt.api_status.api_completion_fence_value =
> -               ++ring->fence_drv.sync_seq;
> +       mes_status_pkt.api_status.api_completion_fence_value = seq;
>
>         amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>                                    sizeof(mes_status_pkt) / 4);
> @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>                 dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>                         x_pkt->header.opcode);
>
> -       r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> +       r = amdgpu_fence_wait_polling(ring, seq, timeout);
>         if (r < 1 || !*status_ptr) {
>
>                 if (misc_op_str)
> @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>         amdgpu_device_wb_free(adev, status_offset);
>         return 0;
>
> +error_undo:
> +       dev_err(adev->dev, "MES ring buffer is full.\n");
> +       amdgpu_ring_undo(ring);
> +
>  error_unlock_free:
>         spin_unlock_irqrestore(&mes->ring_lock, flags);
>
> --
> 2.41.0
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-07-19  9:16 Jack Xiao
  2024-07-19 13:56 ` Alex Deucher
@ 2024-07-19 15:44 ` Christian König
  2024-07-22  3:27   ` Xiao, Jack
  1 sibling, 1 reply; 14+ messages in thread
From: Christian König @ 2024-07-19 15:44 UTC (permalink / raw)
  To: Jack Xiao, amd-gfx, Alexander.Deucher

Am 19.07.24 um 11:16 schrieb Jack Xiao:
> wait memory room until enough before writing mes packets
> to avoid ring buffer overflow.
>
> Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
>   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
>   2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 8ce51b9236c1..68c74adf79f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	const char *op_str, *misc_op_str;
>   	unsigned long flags;
>   	u64 status_gpu_addr;
> -	u32 status_offset;
> +	u32 seq, status_offset;
>   	u64 *status_ptr;
>   	signed long r;
>   	int ret;
> @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	if (r)
>   		goto error_unlock_free;
>   
> +	seq = ++ring->fence_drv.sync_seq;
> +	r = amdgpu_fence_wait_polling(ring,
> +				      seq - ring->fence_drv.num_fences_mask,
> +				      timeout);
> +	if (r < 1)
> +		goto error_undo;
> +
>   	api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
>   	api_status->api_completion_fence_addr = status_gpu_addr;
>   	api_status->api_completion_fence_value = 1;
> @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>   	mes_status_pkt.api_status.api_completion_fence_addr =
>   		ring->fence_drv.gpu_addr;
> -	mes_status_pkt.api_status.api_completion_fence_value =
> -		++ring->fence_drv.sync_seq;
> +	mes_status_pkt.api_status.api_completion_fence_value = seq;
>   
>   	amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>   				   sizeof(mes_status_pkt) / 4);
> @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   		dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>   			x_pkt->header.opcode);
>   
> -	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> +	r = amdgpu_fence_wait_polling(ring, seq, timeout);
>   	if (r < 1 || !*status_ptr) {
>   
>   		if (misc_op_str)
> @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	amdgpu_device_wb_free(adev, status_offset);
>   	return 0;
>   
> +error_undo:
> +	dev_err(adev->dev, "MES ring buffer is full.\n");
> +	amdgpu_ring_undo(ring);
> +
>   error_unlock_free:
>   	spin_unlock_irqrestore(&mes->ring_lock, flags);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index c9f74231ad59..48e01206bcc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	const char *op_str, *misc_op_str;
>   	unsigned long flags;
>   	u64 status_gpu_addr;
> -	u32 status_offset;
> +	u32 seq, status_offset;
>   	u64 *status_ptr;
>   	signed long r;
>   	int ret;
> @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	if (r)
>   		goto error_unlock_free;
>   
> +	seq = ++ring->fence_drv.sync_seq;
> +	r = amdgpu_fence_wait_polling(ring,
> +				      seq - ring->fence_drv.num_fences_mask,

That's what's amdgpu_fence_emit_polling() does anyway.

So this here just moves the polling a bit earlier.

I think we rather need to increase the MES ring size instead.

Regards,
Christian.


> +				      timeout);
> +	if (r < 1)
> +		goto error_undo;
> +
>   	api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
>   	api_status->api_completion_fence_addr = status_gpu_addr;
>   	api_status->api_completion_fence_value = 1;
> @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>   	mes_status_pkt.api_status.api_completion_fence_addr =
>   		ring->fence_drv.gpu_addr;
> -	mes_status_pkt.api_status.api_completion_fence_value =
> -		++ring->fence_drv.sync_seq;
> +	mes_status_pkt.api_status.api_completion_fence_value = seq;
>   
>   	amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>   				   sizeof(mes_status_pkt) / 4);
> @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   		dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>   			x_pkt->header.opcode);
>   
> -	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> +	r = amdgpu_fence_wait_polling(ring, seq, timeout);
>   	if (r < 1 || !*status_ptr) {
>   
>   		if (misc_op_str)
> @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	amdgpu_device_wb_free(adev, status_offset);
>   	return 0;
>   
> +error_undo:
> +	dev_err(adev->dev, "MES ring buffer is full.\n");
> +	amdgpu_ring_undo(ring);
> +
>   error_unlock_free:
>   	spin_unlock_irqrestore(&mes->ring_lock, flags);
>   


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-07-19 15:44 ` Christian König
@ 2024-07-22  3:27   ` Xiao, Jack
  2024-07-22  8:20     ` Christian König
  0 siblings, 1 reply; 14+ messages in thread
From: Xiao, Jack @ 2024-07-22  3:27 UTC (permalink / raw)
  To: Christian König, amd-gfx@lists.freedesktop.org,
	Deucher, Alexander

[-- Attachment #1: Type: text/plain, Size: 6352 bytes --]

[AMD Official Use Only - AMD Internal Distribution Only]

>> I think we rather need to increase the MES ring size instead.

Unfortunately, it doesn't work. I guess mes firmware has limitation.

Regards,
Jack

________________________________
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Friday, 19 July 2024 23:44
To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow

Am 19.07.24 um 11:16 schrieb Jack Xiao:
> wait memory room until enough before writing mes packets
> to avoid ring buffer overflow.
>
> Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
>   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
>   2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 8ce51b9236c1..68c74adf79f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        const char *op_str, *misc_op_str;
>        unsigned long flags;
>        u64 status_gpu_addr;
> -     u32 status_offset;
> +     u32 seq, status_offset;
>        u64 *status_ptr;
>        signed long r;
>        int ret;
> @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        if (r)
>                goto error_unlock_free;
>
> +     seq = ++ring->fence_drv.sync_seq;
> +     r = amdgpu_fence_wait_polling(ring,
> +                                   seq - ring->fence_drv.num_fences_mask,
> +                                   timeout);
> +     if (r < 1)
> +             goto error_undo;
> +
>        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
>        api_status->api_completion_fence_addr = status_gpu_addr;
>        api_status->api_completion_fence_value = 1;
> @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>        mes_status_pkt.api_status.api_completion_fence_addr =
>                ring->fence_drv.gpu_addr;
> -     mes_status_pkt.api_status.api_completion_fence_value =
> -             ++ring->fence_drv.sync_seq;
> +     mes_status_pkt.api_status.api_completion_fence_value = seq;
>
>        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>                                   sizeof(mes_status_pkt) / 4);
> @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>                        x_pkt->header.opcode);
>
> -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
>        if (r < 1 || !*status_ptr) {
>
>                if (misc_op_str)
> @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        amdgpu_device_wb_free(adev, status_offset);
>        return 0;
>
> +error_undo:
> +     dev_err(adev->dev, "MES ring buffer is full.\n");
> +     amdgpu_ring_undo(ring);
> +
>   error_unlock_free:
>        spin_unlock_irqrestore(&mes->ring_lock, flags);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index c9f74231ad59..48e01206bcc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        const char *op_str, *misc_op_str;
>        unsigned long flags;
>        u64 status_gpu_addr;
> -     u32 status_offset;
> +     u32 seq, status_offset;
>        u64 *status_ptr;
>        signed long r;
>        int ret;
> @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        if (r)
>                goto error_unlock_free;
>
> +     seq = ++ring->fence_drv.sync_seq;
> +     r = amdgpu_fence_wait_polling(ring,
> +                                   seq - ring->fence_drv.num_fences_mask,

That's what's amdgpu_fence_emit_polling() does anyway.

So this here just moves the polling a bit earlier.

I think we rather need to increase the MES ring size instead.

Regards,
Christian.


> +                                   timeout);
> +     if (r < 1)
> +             goto error_undo;
> +
>        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
>        api_status->api_completion_fence_addr = status_gpu_addr;
>        api_status->api_completion_fence_value = 1;
> @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>        mes_status_pkt.api_status.api_completion_fence_addr =
>                ring->fence_drv.gpu_addr;
> -     mes_status_pkt.api_status.api_completion_fence_value =
> -             ++ring->fence_drv.sync_seq;
> +     mes_status_pkt.api_status.api_completion_fence_value = seq;
>
>        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>                                   sizeof(mes_status_pkt) / 4);
> @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>                        x_pkt->header.opcode);
>
> -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
>        if (r < 1 || !*status_ptr) {
>
>                if (misc_op_str)
> @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        amdgpu_device_wb_free(adev, status_offset);
>        return 0;
>
> +error_undo:
> +     dev_err(adev->dev, "MES ring buffer is full.\n");
> +     amdgpu_ring_undo(ring);
> +
>   error_unlock_free:
>        spin_unlock_irqrestore(&mes->ring_lock, flags);
>


[-- Attachment #2: Type: text/html, Size: 13370 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-07-22  3:27   ` Xiao, Jack
@ 2024-07-22  8:20     ` Christian König
  2024-07-22  8:46       ` Xiao, Jack
  0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2024-07-22  8:20 UTC (permalink / raw)
  To: Xiao, Jack, amd-gfx@lists.freedesktop.org, Deucher, Alexander

[-- Attachment #1: Type: text/plain, Size: 7348 bytes --]

Thx, but in that case this patch here then won't help either it just 
mitigates the problem.

Can you try to reduce num_hw_submission for the MES ring?

Thanks,
Christian.

Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> />> I think we rather need to increase the MES ring size instead./
>
> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>
> Regards,
> Jack
>
> ------------------------------------------------------------------------
> *From:* Christian König <ckoenig.leichtzumerken@gmail.com>
> *Sent:* Friday, 19 July 2024 23:44
> *To:* Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org 
> <amd-gfx@lists.freedesktop.org>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
> Am 19.07.24 um 11:16 schrieb Jack Xiao:
> > wait memory room until enough before writing mes packets
> > to avoid ring buffer overflow.
> >
> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> >   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> >   2 files changed, 28 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index 8ce51b9236c1..68c74adf79f1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -168,7 +168,7 @@ static int 
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        const char *op_str, *misc_op_str;
> >        unsigned long flags;
> >        u64 status_gpu_addr;
> > -     u32 status_offset;
> > +     u32 seq, status_offset;
> >        u64 *status_ptr;
> >        signed long r;
> >        int ret;
> > @@ -196,6 +196,13 @@ static int 
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        if (r)
> >                goto error_unlock_free;
> >
> > +     seq = ++ring->fence_drv.sync_seq;
> > +     r = amdgpu_fence_wait_polling(ring,
> > +                                   seq - 
> ring->fence_drv.num_fences_mask,
> > +                                   timeout);
> > +     if (r < 1)
> > +             goto error_undo;
> > +
> >        api_status = (struct MES_API_STATUS *)((char *)pkt + 
> api_status_off);
> >        api_status->api_completion_fence_addr = status_gpu_addr;
> >        api_status->api_completion_fence_value = 1;
> > @@ -208,8 +215,7 @@ static int 
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> >                ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > -             ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -229,7 +235,7 @@ static int 
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> >                        x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, 
> timeout);
> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
> >        if (r < 1 || !*status_ptr) {
> >
> >                if (misc_op_str)
> > @@ -252,6 +258,10 @@ static int 
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        amdgpu_device_wb_free(adev, status_offset);
> >        return 0;
> >
> > +error_undo:
> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
> > +     amdgpu_ring_undo(ring);
> > +
> >   error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > index c9f74231ad59..48e01206bcc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > @@ -154,7 +154,7 @@ static int 
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        const char *op_str, *misc_op_str;
> >        unsigned long flags;
> >        u64 status_gpu_addr;
> > -     u32 status_offset;
> > +     u32 seq, status_offset;
> >        u64 *status_ptr;
> >        signed long r;
> >        int ret;
> > @@ -182,6 +182,13 @@ static int 
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        if (r)
> >                goto error_unlock_free;
> >
> > +     seq = ++ring->fence_drv.sync_seq;
> > +     r = amdgpu_fence_wait_polling(ring,
> > +                                   seq - 
> ring->fence_drv.num_fences_mask,
>
> That's what's amdgpu_fence_emit_polling() does anyway.
>
> So this here just moves the polling a bit earlier.
>
> I think we rather need to increase the MES ring size instead.
>
> Regards,
> Christian.
>
>
> > +                                   timeout);
> > +     if (r < 1)
> > +             goto error_undo;
> > +
> >        api_status = (struct MES_API_STATUS *)((char *)pkt + 
> api_status_off);
> >        api_status->api_completion_fence_addr = status_gpu_addr;
> >        api_status->api_completion_fence_value = 1;
> > @@ -194,8 +201,7 @@ static int 
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> >                ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > -             ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -215,7 +221,7 @@ static int 
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> >                        x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, 
> timeout);
> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
> >        if (r < 1 || !*status_ptr) {
> >
> >                if (misc_op_str)
> > @@ -238,6 +244,10 @@ static int 
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        amdgpu_device_wb_free(adev, status_offset);
> >        return 0;
> >
> > +error_undo:
> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
> > +     amdgpu_ring_undo(ring);
> > +
> >   error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
>

[-- Attachment #2: Type: text/html, Size: 14561 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-07-22  8:20     ` Christian König
@ 2024-07-22  8:46       ` Xiao, Jack
  2024-07-22 11:20         ` Christian König
  0 siblings, 1 reply; 14+ messages in thread
From: Xiao, Jack @ 2024-07-22  8:46 UTC (permalink / raw)
  To: Christian König, amd-gfx@lists.freedesktop.org,
	Deucher, Alexander

[-- Attachment #1: Type: text/plain, Size: 7461 bytes --]

[AMD Official Use Only - AMD Internal Distribution Only]

>> Can you try to reduce num_hw_submission for the MES ring?

Smaller num_hw_submission should not help for this issue, for Mes work without drm scheduler like legacy kiq. Smaller num_hw_submission will result in smaller mes ring size and more waiting time.

Regards,
Jack
________________________________
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Monday, 22 July 2024 16:20
To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow

Thx, but in that case this patch here then won't help either it just mitigates the problem.

Can you try to reduce num_hw_submission for the MES ring?

Thanks,
Christian.

Am 22.07.24 um 05:27 schrieb Xiao, Jack:

[AMD Official Use Only - AMD Internal Distribution Only]

>> I think we rather need to increase the MES ring size instead.

Unfortunately, it doesn't work. I guess mes firmware has limitation.

Regards,
Jack

________________________________
From: Christian König <ckoenig.leichtzumerken@gmail.com><mailto:ckoenig.leichtzumerken@gmail.com>
Sent: Friday, 19 July 2024 23:44
To: Xiao, Jack <Jack.Xiao@amd.com><mailto:Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org><mailto:amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow

Am 19.07.24 um 11:16 schrieb Jack Xiao:
> wait memory room until enough before writing mes packets
> to avoid ring buffer overflow.
>
> Signed-off-by: Jack Xiao <Jack.Xiao@amd.com><mailto:Jack.Xiao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
>   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
>   2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 8ce51b9236c1..68c74adf79f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        const char *op_str, *misc_op_str;
>        unsigned long flags;
>        u64 status_gpu_addr;
> -     u32 status_offset;
> +     u32 seq, status_offset;
>        u64 *status_ptr;
>        signed long r;
>        int ret;
> @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        if (r)
>                goto error_unlock_free;
>
> +     seq = ++ring->fence_drv.sync_seq;
> +     r = amdgpu_fence_wait_polling(ring,
> +                                   seq - ring->fence_drv.num_fences_mask,
> +                                   timeout);
> +     if (r < 1)
> +             goto error_undo;
> +
>        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
>        api_status->api_completion_fence_addr = status_gpu_addr;
>        api_status->api_completion_fence_value = 1;
> @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>        mes_status_pkt.api_status.api_completion_fence_addr =
>                ring->fence_drv.gpu_addr;
> -     mes_status_pkt.api_status.api_completion_fence_value =
> -             ++ring->fence_drv.sync_seq;
> +     mes_status_pkt.api_status.api_completion_fence_value = seq;
>
>        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>                                   sizeof(mes_status_pkt) / 4);
> @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>                        x_pkt->header.opcode);
>
> -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
>        if (r < 1 || !*status_ptr) {
>
>                if (misc_op_str)
> @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        amdgpu_device_wb_free(adev, status_offset);
>        return 0;
>
> +error_undo:
> +     dev_err(adev->dev, "MES ring buffer is full.\n");
> +     amdgpu_ring_undo(ring);
> +
>   error_unlock_free:
>        spin_unlock_irqrestore(&mes->ring_lock, flags);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index c9f74231ad59..48e01206bcc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        const char *op_str, *misc_op_str;
>        unsigned long flags;
>        u64 status_gpu_addr;
> -     u32 status_offset;
> +     u32 seq, status_offset;
>        u64 *status_ptr;
>        signed long r;
>        int ret;
> @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        if (r)
>                goto error_unlock_free;
>
> +     seq = ++ring->fence_drv.sync_seq;
> +     r = amdgpu_fence_wait_polling(ring,
> +                                   seq - ring->fence_drv.num_fences_mask,

That's what's amdgpu_fence_emit_polling() does anyway.

So this here just moves the polling a bit earlier.

I think we rather need to increase the MES ring size instead.

Regards,
Christian.


> +                                   timeout);
> +     if (r < 1)
> +             goto error_undo;
> +
>        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
>        api_status->api_completion_fence_addr = status_gpu_addr;
>        api_status->api_completion_fence_value = 1;
> @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>        mes_status_pkt.api_status.api_completion_fence_addr =
>                ring->fence_drv.gpu_addr;
> -     mes_status_pkt.api_status.api_completion_fence_value =
> -             ++ring->fence_drv.sync_seq;
> +     mes_status_pkt.api_status.api_completion_fence_value = seq;
>
>        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>                                   sizeof(mes_status_pkt) / 4);
> @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>                        x_pkt->header.opcode);
>
> -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
>        if (r < 1 || !*status_ptr) {
>
>                if (misc_op_str)
> @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>        amdgpu_device_wb_free(adev, status_offset);
>        return 0;
>
> +error_undo:
> +     dev_err(adev->dev, "MES ring buffer is full.\n");
> +     amdgpu_ring_undo(ring);
> +
>   error_unlock_free:
>        spin_unlock_irqrestore(&mes->ring_lock, flags);
>



[-- Attachment #2: Type: text/html, Size: 16249 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-07-22  8:46       ` Xiao, Jack
@ 2024-07-22 11:20         ` Christian König
  2024-07-22 19:52           ` Alex Deucher
  0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2024-07-22 11:20 UTC (permalink / raw)
  To: Xiao, Jack, amd-gfx@lists.freedesktop.org, Deucher, Alexander

[-- Attachment #1: Type: text/plain, Size: 8983 bytes --]

What I meant is that the MES ring is now to small for the number of 
packets written to it.

Either reduce the num_hw_submission or increase the MES ring size.

E.g. see this code here in amdgpu_ring_init:

         if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
                 sched_hw_submission = max(sched_hw_submission, 256);
         else if (ring == &adev->sdma.instance[0].page)
                 sched_hw_submission = 256;

We are basically just missing a case for the MES here as far as I can see.

Regards,
Christian.

Am 22.07.24 um 10:46 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> />> Can you try to reduce num_hw_submission for the MES ring?
> /
> Smaller num_hw_submission should not help for this issue, for Mes work 
> without drm scheduler like legacy kiq. Smaller num_hw_submission will 
> result in smaller mes ring size and more waiting time.
>
> Regards,
> Jack
> ------------------------------------------------------------------------
> *From:* Christian König <ckoenig.leichtzumerken@gmail.com>
> *Sent:* Monday, 22 July 2024 16:20
> *To:* Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org 
> <amd-gfx@lists.freedesktop.org>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
> Thx, but in that case this patch here then won't help either it just 
> mitigates the problem.
>
> Can you try to reduce num_hw_submission for the MES ring?
>
> Thanks,
> Christian.
>
> Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>>
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>>
>> />> I think we rather need to increase the MES ring size instead./
>>
>> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>>
>> Regards,
>> Jack
>>
>> ------------------------------------------------------------------------
>> *From:* Christian König <ckoenig.leichtzumerken@gmail.com> 
>> <mailto:ckoenig.leichtzumerken@gmail.com>
>> *Sent:* Friday, 19 July 2024 23:44
>> *To:* Xiao, Jack <Jack.Xiao@amd.com> <mailto:Jack.Xiao@amd.com>; 
>> amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org> 
>> <amd-gfx@lists.freedesktop.org> 
>> <mailto:amd-gfx@lists.freedesktop.org>; Deucher, Alexander 
>> <Alexander.Deucher@amd.com> <mailto:Alexander.Deucher@amd.com>
>> *Subject:* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>> Am 19.07.24 um 11:16 schrieb Jack Xiao:
>> > wait memory room until enough before writing mes packets
>> > to avoid ring buffer overflow.
>> >
>> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com> <mailto:Jack.Xiao@amd.com>
>> > ---
>> >   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
>> >   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
>> >   2 files changed, 28 insertions(+), 8 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> > index 8ce51b9236c1..68c74adf79f1 100644
>> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> > @@ -168,7 +168,7 @@ static int 
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> >        const char *op_str, *misc_op_str;
>> >        unsigned long flags;
>> >        u64 status_gpu_addr;
>> > -     u32 status_offset;
>> > +     u32 seq, status_offset;
>> >        u64 *status_ptr;
>> >        signed long r;
>> >        int ret;
>> > @@ -196,6 +196,13 @@ static int 
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> >        if (r)
>> >                goto error_unlock_free;
>> >
>> > +     seq = ++ring->fence_drv.sync_seq;
>> > +     r = amdgpu_fence_wait_polling(ring,
>> > +                                   seq - 
>> ring->fence_drv.num_fences_mask,
>> > +                                   timeout);
>> > +     if (r < 1)
>> > +             goto error_undo;
>> > +
>> >        api_status = (struct MES_API_STATUS *)((char *)pkt + 
>> api_status_off);
>> > api_status->api_completion_fence_addr = status_gpu_addr;
>> > api_status->api_completion_fence_value = 1;
>> > @@ -208,8 +215,7 @@ static int 
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>> > mes_status_pkt.api_status.api_completion_fence_addr =
>> >                ring->fence_drv.gpu_addr;
>> > - mes_status_pkt.api_status.api_completion_fence_value =
>> > -             ++ring->fence_drv.sync_seq;
>> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
>> >
>> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>> > sizeof(mes_status_pkt) / 4);
>> > @@ -229,7 +235,7 @@ static int 
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>> > x_pkt->header.opcode);
>> >
>> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, 
>> timeout);
>> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
>> >        if (r < 1 || !*status_ptr) {
>> >
>> >                if (misc_op_str)
>> > @@ -252,6 +258,10 @@ static int 
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> >        amdgpu_device_wb_free(adev, status_offset);
>> >        return 0;
>> >
>> > +error_undo:
>> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
>> > +     amdgpu_ring_undo(ring);
>> > +
>> >   error_unlock_free:
>> > spin_unlock_irqrestore(&mes->ring_lock, flags);
>> >
>> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> > index c9f74231ad59..48e01206bcc4 100644
>> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> > @@ -154,7 +154,7 @@ static int 
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> >        const char *op_str, *misc_op_str;
>> >        unsigned long flags;
>> >        u64 status_gpu_addr;
>> > -     u32 status_offset;
>> > +     u32 seq, status_offset;
>> >        u64 *status_ptr;
>> >        signed long r;
>> >        int ret;
>> > @@ -182,6 +182,13 @@ static int 
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> >        if (r)
>> >                goto error_unlock_free;
>> >
>> > +     seq = ++ring->fence_drv.sync_seq;
>> > +     r = amdgpu_fence_wait_polling(ring,
>> > +                                   seq - 
>> ring->fence_drv.num_fences_mask,
>>
>> That's what's amdgpu_fence_emit_polling() does anyway.
>>
>> So this here just moves the polling a bit earlier.
>>
>> I think we rather need to increase the MES ring size instead.
>>
>> Regards,
>> Christian.
>>
>>
>> > +                                   timeout);
>> > +     if (r < 1)
>> > +             goto error_undo;
>> > +
>> >        api_status = (struct MES_API_STATUS *)((char *)pkt + 
>> api_status_off);
>> > api_status->api_completion_fence_addr = status_gpu_addr;
>> > api_status->api_completion_fence_value = 1;
>> > @@ -194,8 +201,7 @@ static int 
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>> > mes_status_pkt.api_status.api_completion_fence_addr =
>> >                ring->fence_drv.gpu_addr;
>> > - mes_status_pkt.api_status.api_completion_fence_value =
>> > -             ++ring->fence_drv.sync_seq;
>> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
>> >
>> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>> > sizeof(mes_status_pkt) / 4);
>> > @@ -215,7 +221,7 @@ static int 
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>> > x_pkt->header.opcode);
>> >
>> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, 
>> timeout);
>> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
>> >        if (r < 1 || !*status_ptr) {
>> >
>> >                if (misc_op_str)
>> > @@ -238,6 +244,10 @@ static int 
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> >        amdgpu_device_wb_free(adev, status_offset);
>> >        return 0;
>> >
>> > +error_undo:
>> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
>> > +     amdgpu_ring_undo(ring);
>> > +
>> >   error_unlock_free:
>> > spin_unlock_irqrestore(&mes->ring_lock, flags);
>> >
>>
>

[-- Attachment #2: Type: text/html, Size: 20657 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-07-22 11:20         ` Christian König
@ 2024-07-22 19:52           ` Alex Deucher
  2024-07-23  3:08             ` Xiao, Jack
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Deucher @ 2024-07-22 19:52 UTC (permalink / raw)
  To: Christian König
  Cc: Xiao, Jack, amd-gfx@lists.freedesktop.org, Deucher, Alexander

Does this patch fix it?
https://patchwork.freedesktop.org/patch/605437/

Alex

On Mon, Jul 22, 2024 at 7:21 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> What I meant is that the MES ring is now to small for the number of packets written to it.
>
> Either reduce the num_hw_submission or increase the MES ring size.
>
> E.g. see this code here in amdgpu_ring_init:
>
>         if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
>                 sched_hw_submission = max(sched_hw_submission, 256);
>         else if (ring == &adev->sdma.instance[0].page)
>                 sched_hw_submission = 256;
>
> We are basically just missing a case for the MES here as far as I can see.
>
> Regards,
> Christian.
>
> Am 22.07.24 um 10:46 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> Can you try to reduce num_hw_submission for the MES ring?
>
> Smaller num_hw_submission should not help for this issue, for Mes work without drm scheduler like legacy kiq. Smaller num_hw_submission will result in smaller mes ring size and more waiting time.
>
> Regards,
> Jack
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, 22 July 2024 16:20
> To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Thx, but in that case this patch here then won't help either it just mitigates the problem.
>
> Can you try to reduce num_hw_submission for the MES ring?
>
> Thanks,
> Christian.
>
> Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> I think we rather need to increase the MES ring size instead.
>
> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>
> Regards,
> Jack
>
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Friday, 19 July 2024 23:44
> To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Am 19.07.24 um 11:16 schrieb Jack Xiao:
> > wait memory room until enough before writing mes packets
> > to avoid ring buffer overflow.
> >
> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> >   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> >   2 files changed, 28 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index 8ce51b9236c1..68c74adf79f1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        const char *op_str, *misc_op_str;
> >        unsigned long flags;
> >        u64 status_gpu_addr;
> > -     u32 status_offset;
> > +     u32 seq, status_offset;
> >        u64 *status_ptr;
> >        signed long r;
> >        int ret;
> > @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        if (r)
> >                goto error_unlock_free;
> >
> > +     seq = ++ring->fence_drv.sync_seq;
> > +     r = amdgpu_fence_wait_polling(ring,
> > +                                   seq - ring->fence_drv.num_fences_mask,
> > +                                   timeout);
> > +     if (r < 1)
> > +             goto error_undo;
> > +
> >        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> >        api_status->api_completion_fence_addr = status_gpu_addr;
> >        api_status->api_completion_fence_value = 1;
> > @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> >        mes_status_pkt.api_status.api_completion_fence_addr =
> >                ring->fence_drv.gpu_addr;
> > -     mes_status_pkt.api_status.api_completion_fence_value =
> > -             ++ring->fence_drv.sync_seq;
> > +     mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> >                                   sizeof(mes_status_pkt) / 4);
> > @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> >                        x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
> >        if (r < 1 || !*status_ptr) {
> >
> >                if (misc_op_str)
> > @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        amdgpu_device_wb_free(adev, status_offset);
> >        return 0;
> >
> > +error_undo:
> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
> > +     amdgpu_ring_undo(ring);
> > +
> >   error_unlock_free:
> >        spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > index c9f74231ad59..48e01206bcc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        const char *op_str, *misc_op_str;
> >        unsigned long flags;
> >        u64 status_gpu_addr;
> > -     u32 status_offset;
> > +     u32 seq, status_offset;
> >        u64 *status_ptr;
> >        signed long r;
> >        int ret;
> > @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        if (r)
> >                goto error_unlock_free;
> >
> > +     seq = ++ring->fence_drv.sync_seq;
> > +     r = amdgpu_fence_wait_polling(ring,
> > +                                   seq - ring->fence_drv.num_fences_mask,
>
> That's what's amdgpu_fence_emit_polling() does anyway.
>
> So this here just moves the polling a bit earlier.
>
> I think we rather need to increase the MES ring size instead.
>
> Regards,
> Christian.
>
>
> > +                                   timeout);
> > +     if (r < 1)
> > +             goto error_undo;
> > +
> >        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> >        api_status->api_completion_fence_addr = status_gpu_addr;
> >        api_status->api_completion_fence_value = 1;
> > @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> >        mes_status_pkt.api_status.api_completion_fence_addr =
> >                ring->fence_drv.gpu_addr;
> > -     mes_status_pkt.api_status.api_completion_fence_value =
> > -             ++ring->fence_drv.sync_seq;
> > +     mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> >                                   sizeof(mes_status_pkt) / 4);
> > @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> >                        x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
> >        if (r < 1 || !*status_ptr) {
> >
> >                if (misc_op_str)
> > @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        amdgpu_device_wb_free(adev, status_offset);
> >        return 0;
> >
> > +error_undo:
> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
> > +     amdgpu_ring_undo(ring);
> > +
> >   error_unlock_free:
> >        spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
>
>
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-07-22 19:52           ` Alex Deucher
@ 2024-07-23  3:08             ` Xiao, Jack
  2024-07-23  8:15               ` Xiao, Jack
  0 siblings, 1 reply; 14+ messages in thread
From: Xiao, Jack @ 2024-07-23  3:08 UTC (permalink / raw)
  To: Alex Deucher, Christian König
  Cc: amd-gfx@lists.freedesktop.org, Deucher,  Alexander

[-- Attachment #1: Type: text/plain, Size: 9009 bytes --]

[AMD Official Use Only - AMD Internal Distribution Only]

> Does this patch fix it?
https://patchwork.freedesktop.org/patch/605437/

No, please do not check in the patch, it will make my fix not working.

Regards,
Jack
________________________________
From: Alex Deucher <alexdeucher@gmail.com>
Sent: Tuesday, 23 July 2024 03:52
To: Christian König <ckoenig.leichtzumerken@gmail.com>
Cc: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow

Does this patch fix it?
https://patchwork.freedesktop.org/patch/605437/

Alex

On Mon, Jul 22, 2024 at 7:21 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> What I meant is that the MES ring is now to small for the number of packets written to it.
>
> Either reduce the num_hw_submission or increase the MES ring size.
>
> E.g. see this code here in amdgpu_ring_init:
>
>         if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
>                 sched_hw_submission = max(sched_hw_submission, 256);
>         else if (ring == &adev->sdma.instance[0].page)
>                 sched_hw_submission = 256;
>
> We are basically just missing a case for the MES here as far as I can see.
>
> Regards,
> Christian.
>
> Am 22.07.24 um 10:46 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> Can you try to reduce num_hw_submission for the MES ring?
>
> Smaller num_hw_submission should not help for this issue, for Mes work without drm scheduler like legacy kiq. Smaller num_hw_submission will result in smaller mes ring size and more waiting time.
>
> Regards,
> Jack
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, 22 July 2024 16:20
> To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Thx, but in that case this patch here then won't help either it just mitigates the problem.
>
> Can you try to reduce num_hw_submission for the MES ring?
>
> Thanks,
> Christian.
>
> Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> I think we rather need to increase the MES ring size instead.
>
> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>
> Regards,
> Jack
>
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Friday, 19 July 2024 23:44
> To: Xiao, Jack <Jack.Xiao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Am 19.07.24 um 11:16 schrieb Jack Xiao:
> > wait memory room until enough before writing mes packets
> > to avoid ring buffer overflow.
> >
> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> >   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> >   2 files changed, 28 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index 8ce51b9236c1..68c74adf79f1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        const char *op_str, *misc_op_str;
> >        unsigned long flags;
> >        u64 status_gpu_addr;
> > -     u32 status_offset;
> > +     u32 seq, status_offset;
> >        u64 *status_ptr;
> >        signed long r;
> >        int ret;
> > @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        if (r)
> >                goto error_unlock_free;
> >
> > +     seq = ++ring->fence_drv.sync_seq;
> > +     r = amdgpu_fence_wait_polling(ring,
> > +                                   seq - ring->fence_drv.num_fences_mask,
> > +                                   timeout);
> > +     if (r < 1)
> > +             goto error_undo;
> > +
> >        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> >        api_status->api_completion_fence_addr = status_gpu_addr;
> >        api_status->api_completion_fence_value = 1;
> > @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> >        mes_status_pkt.api_status.api_completion_fence_addr =
> >                ring->fence_drv.gpu_addr;
> > -     mes_status_pkt.api_status.api_completion_fence_value =
> > -             ++ring->fence_drv.sync_seq;
> > +     mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> >                                   sizeof(mes_status_pkt) / 4);
> > @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> >                        x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
> >        if (r < 1 || !*status_ptr) {
> >
> >                if (misc_op_str)
> > @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        amdgpu_device_wb_free(adev, status_offset);
> >        return 0;
> >
> > +error_undo:
> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
> > +     amdgpu_ring_undo(ring);
> > +
> >   error_unlock_free:
> >        spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > index c9f74231ad59..48e01206bcc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        const char *op_str, *misc_op_str;
> >        unsigned long flags;
> >        u64 status_gpu_addr;
> > -     u32 status_offset;
> > +     u32 seq, status_offset;
> >        u64 *status_ptr;
> >        signed long r;
> >        int ret;
> > @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        if (r)
> >                goto error_unlock_free;
> >
> > +     seq = ++ring->fence_drv.sync_seq;
> > +     r = amdgpu_fence_wait_polling(ring,
> > +                                   seq - ring->fence_drv.num_fences_mask,
>
> That's what's amdgpu_fence_emit_polling() does anyway.
>
> So this here just moves the polling a bit earlier.
>
> I think we rather need to increase the MES ring size instead.
>
> Regards,
> Christian.
>
>
> > +                                   timeout);
> > +     if (r < 1)
> > +             goto error_undo;
> > +
> >        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> >        api_status->api_completion_fence_addr = status_gpu_addr;
> >        api_status->api_completion_fence_value = 1;
> > @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> >        mes_status_pkt.api_status.api_completion_fence_addr =
> >                ring->fence_drv.gpu_addr;
> > -     mes_status_pkt.api_status.api_completion_fence_value =
> > -             ++ring->fence_drv.sync_seq;
> > +     mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> >                                   sizeof(mes_status_pkt) / 4);
> > @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> >                        x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
> >        if (r < 1 || !*status_ptr) {
> >
> >                if (misc_op_str)
> > @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        amdgpu_device_wb_free(adev, status_offset);
> >        return 0;
> >
> > +error_undo:
> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
> > +     amdgpu_ring_undo(ring);
> > +
> >   error_unlock_free:
> >        spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
>
>
>

[-- Attachment #2: Type: text/html, Size: 17378 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-07-23  3:08             ` Xiao, Jack
@ 2024-07-23  8:15               ` Xiao, Jack
  0 siblings, 0 replies; 14+ messages in thread
From: Xiao, Jack @ 2024-07-23  8:15 UTC (permalink / raw)
  To: Xiao, Jack, Alex Deucher, Christian König
  Cc: amd-gfx@lists.freedesktop.org, Deucher,  Alexander

[-- Attachment #1: Type: text/plain, Size: 10396 bytes --]

[AMD Official Use Only - AMD Internal Distribution Only]

There is a max command number for mes queue in mes_api_def.h

enum {API_NUMBER_OF_COMMAND_MAX   = 32};

It should explain why the patch can’t fix the issue.

I will send out another patch to refine code according to the firmware limitation.

Regards,
Jack

From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Xiao, Jack
Sent: Tuesday, July 23, 2024 11:08 AM
To: Alex Deucher <alexdeucher@gmail.com>; Christian König <ckoenig.leichtzumerken@gmail.com>
Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow


[AMD Official Use Only - AMD Internal Distribution Only]


[AMD Official Use Only - AMD Internal Distribution Only]

> Does this patch fix it?
https://patchwork.freedesktop.org/patch/605437/

No, please do not check in the patch, it will make my fix not working.

Regards,
Jack
________________________________
From: Alex Deucher <alexdeucher@gmail.com<mailto:alexdeucher@gmail.com>>
Sent: Tuesday, 23 July 2024 03:52
To: Christian König <ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>>
Cc: Xiao, Jack <Jack.Xiao@amd.com<mailto:Jack.Xiao@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>; Deucher, Alexander <Alexander.Deucher@amd.com<mailto:Alexander.Deucher@amd.com>>
Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow

Does this patch fix it?
https://patchwork.freedesktop.org/patch/605437/

Alex

On Mon, Jul 22, 2024 at 7:21 AM Christian König
<ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>> wrote:
>
> What I meant is that the MES ring is now to small for the number of packets written to it.
>
> Either reduce the num_hw_submission or increase the MES ring size.
>
> E.g. see this code here in amdgpu_ring_init:
>
>         if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
>                 sched_hw_submission = max(sched_hw_submission, 256);
>         else if (ring == &adev->sdma.instance[0].page)
>                 sched_hw_submission = 256;
>
> We are basically just missing a case for the MES here as far as I can see.
>
> Regards,
> Christian.
>
> Am 22.07.24 um 10:46 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> Can you try to reduce num_hw_submission for the MES ring?
>
> Smaller num_hw_submission should not help for this issue, for Mes work without drm scheduler like legacy kiq. Smaller num_hw_submission will result in smaller mes ring size and more waiting time.
>
> Regards,
> Jack
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>>
> Sent: Monday, 22 July 2024 16:20
> To: Xiao, Jack <Jack.Xiao@amd.com<mailto:Jack.Xiao@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>; Deucher, Alexander <Alexander.Deucher@amd.com<mailto:Alexander.Deucher@amd.com>>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Thx, but in that case this patch here then won't help either it just mitigates the problem.
>
> Can you try to reduce num_hw_submission for the MES ring?
>
> Thanks,
> Christian.
>
> Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> I think we rather need to increase the MES ring size instead.
>
> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>
> Regards,
> Jack
>
> ________________________________
> From: Christian König <ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>>
> Sent: Friday, 19 July 2024 23:44
> To: Xiao, Jack <Jack.Xiao@amd.com<mailto:Jack.Xiao@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>; Deucher, Alexander <Alexander.Deucher@amd.com<mailto:Alexander.Deucher@amd.com>>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> Am 19.07.24 um 11:16 schrieb Jack Xiao:
> > wait memory room until enough before writing mes packets
> > to avoid ring buffer overflow.
> >
> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com<mailto:Jack.Xiao@amd.com>>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> >   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> >   2 files changed, 28 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index 8ce51b9236c1..68c74adf79f1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        const char *op_str, *misc_op_str;
> >        unsigned long flags;
> >        u64 status_gpu_addr;
> > -     u32 status_offset;
> > +     u32 seq, status_offset;
> >        u64 *status_ptr;
> >        signed long r;
> >        int ret;
> > @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        if (r)
> >                goto error_unlock_free;
> >
> > +     seq = ++ring->fence_drv.sync_seq;
> > +     r = amdgpu_fence_wait_polling(ring,
> > +                                   seq - ring->fence_drv.num_fences_mask,
> > +                                   timeout);
> > +     if (r < 1)
> > +             goto error_undo;
> > +
> >        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> >        api_status->api_completion_fence_addr = status_gpu_addr;
> >        api_status->api_completion_fence_value = 1;
> > @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> >        mes_status_pkt.api_status.api_completion_fence_addr =
> >                ring->fence_drv.gpu_addr;
> > -     mes_status_pkt.api_status.api_completion_fence_value =
> > -             ++ring->fence_drv.sync_seq;
> > +     mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> >                                   sizeof(mes_status_pkt) / 4);
> > @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> >                        x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
> >        if (r < 1 || !*status_ptr) {
> >
> >                if (misc_op_str)
> > @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        amdgpu_device_wb_free(adev, status_offset);
> >        return 0;
> >
> > +error_undo:
> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
> > +     amdgpu_ring_undo(ring);
> > +
> >   error_unlock_free:
> >        spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > index c9f74231ad59..48e01206bcc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        const char *op_str, *misc_op_str;
> >        unsigned long flags;
> >        u64 status_gpu_addr;
> > -     u32 status_offset;
> > +     u32 seq, status_offset;
> >        u64 *status_ptr;
> >        signed long r;
> >        int ret;
> > @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        if (r)
> >                goto error_unlock_free;
> >
> > +     seq = ++ring->fence_drv.sync_seq;
> > +     r = amdgpu_fence_wait_polling(ring,
> > +                                   seq - ring->fence_drv.num_fences_mask,
>
> That's what's amdgpu_fence_emit_polling() does anyway.
>
> So this here just moves the polling a bit earlier.
>
> I think we rather need to increase the MES ring size instead.
>
> Regards,
> Christian.
>
>
> > +                                   timeout);
> > +     if (r < 1)
> > +             goto error_undo;
> > +
> >        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
> >        api_status->api_completion_fence_addr = status_gpu_addr;
> >        api_status->api_completion_fence_value = 1;
> > @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> >        mes_status_pkt.api_status.api_completion_fence_addr =
> >                ring->fence_drv.gpu_addr;
> > -     mes_status_pkt.api_status.api_completion_fence_value =
> > -             ++ring->fence_drv.sync_seq;
> > +     mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> >                                   sizeof(mes_status_pkt) / 4);
> > @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> >                        x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
> >        if (r < 1 || !*status_ptr) {
> >
> >                if (misc_op_str)
> > @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        amdgpu_device_wb_free(adev, status_offset);
> >        return 0;
> >
> > +error_undo:
> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
> > +     amdgpu_ring_undo(ring);
> > +
> >   error_unlock_free:
> >        spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
>
>
>

[-- Attachment #2: Type: text/html, Size: 21852 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
@ 2024-08-27 14:10 Alex Deucher
  2024-08-27 14:21 ` Greg KH
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Deucher @ 2024-08-27 14:10 UTC (permalink / raw)
  To: stable, gregkh, sashal; +Cc: Jack Xiao, Alex Deucher

From: Jack Xiao <Jack.Xiao@amd.com>

wait memory room until enough before writing mes packets
to avoid ring buffer overflow.

v2: squash in sched_hw_submission fix

Backport from 6.11.

Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3571
Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")
Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 34e087e8920e635c62e2ed6a758b0cd27f836d13)
Cc: stable@vger.kernel.org # 6.10.x
(cherry picked from commit 11752c013f562a1124088a35bd314aa0e9f0e88f)
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  2 ++
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c   | 18 ++++++++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 06f0a6534a94..88ffb15e25cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -212,6 +212,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
 	 */
 	if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
 		sched_hw_submission = max(sched_hw_submission, 256);
+	if (ring->funcs->type == AMDGPU_RING_TYPE_MES)
+		sched_hw_submission = 8;
 	else if (ring == &adev->sdma.instance[0].page)
 		sched_hw_submission = 256;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 32d4519541c6..e1a66d585f5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -163,7 +163,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	const char *op_str, *misc_op_str;
 	unsigned long flags;
 	u64 status_gpu_addr;
-	u32 status_offset;
+	u32 seq, status_offset;
 	u64 *status_ptr;
 	signed long r;
 	int ret;
@@ -191,6 +191,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	if (r)
 		goto error_unlock_free;
 
+	seq = ++ring->fence_drv.sync_seq;
+	r = amdgpu_fence_wait_polling(ring,
+				      seq - ring->fence_drv.num_fences_mask,
+				      timeout);
+	if (r < 1)
+		goto error_undo;
+
 	api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
 	api_status->api_completion_fence_addr = status_gpu_addr;
 	api_status->api_completion_fence_value = 1;
@@ -203,8 +210,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
 	mes_status_pkt.api_status.api_completion_fence_addr =
 		ring->fence_drv.gpu_addr;
-	mes_status_pkt.api_status.api_completion_fence_value =
-		++ring->fence_drv.sync_seq;
+	mes_status_pkt.api_status.api_completion_fence_value = seq;
 
 	amdgpu_ring_write_multiple(ring, &mes_status_pkt,
 				   sizeof(mes_status_pkt) / 4);
@@ -224,7 +230,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 		dev_dbg(adev->dev, "MES msg=%d was emitted\n",
 			x_pkt->header.opcode);
 
-	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
+	r = amdgpu_fence_wait_polling(ring, seq, timeout);
 	if (r < 1 || !*status_ptr) {
 
 		if (misc_op_str)
@@ -247,6 +253,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	amdgpu_device_wb_free(adev, status_offset);
 	return 0;
 
+error_undo:
+	dev_err(adev->dev, "MES ring buffer is full.\n");
+	amdgpu_ring_undo(ring);
+
 error_unlock_free:
 	spin_unlock_irqrestore(&mes->ring_lock, flags);
 
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-08-27 14:10 [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow Alex Deucher
@ 2024-08-27 14:21 ` Greg KH
  2024-08-27 15:01   ` Deucher, Alexander
  0 siblings, 1 reply; 14+ messages in thread
From: Greg KH @ 2024-08-27 14:21 UTC (permalink / raw)
  To: Alex Deucher; +Cc: stable, sashal, Jack Xiao

On Tue, Aug 27, 2024 at 10:10:25AM -0400, Alex Deucher wrote:
> From: Jack Xiao <Jack.Xiao@amd.com>
> 
> wait memory room until enough before writing mes packets
> to avoid ring buffer overflow.
> 
> v2: squash in sched_hw_submission fix
> 
> Backport from 6.11.
> 
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3571
> Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
> Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")

These commits are in 6.11-rc1.

> Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> Acked-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> (cherry picked from commit 34e087e8920e635c62e2ed6a758b0cd27f836d13)
> Cc: stable@vger.kernel.org # 6.10.x

So why does this need to go to 6.10.y?

confused,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-08-27 14:21 ` Greg KH
@ 2024-08-27 15:01   ` Deucher, Alexander
  2024-08-27 16:13     ` Greg KH
  0 siblings, 1 reply; 14+ messages in thread
From: Deucher, Alexander @ 2024-08-27 15:01 UTC (permalink / raw)
  To: Greg KH; +Cc: stable@vger.kernel.org, sashal@kernel.org, Xiao, Jack

[Public]

> -----Original Message-----
> From: Greg KH <gregkh@linuxfoundation.org>
> Sent: Tuesday, August 27, 2024 10:21 AM
> To: Deucher, Alexander <Alexander.Deucher@amd.com>
> Cc: stable@vger.kernel.org; sashal@kernel.org; Xiao, Jack
> <Jack.Xiao@amd.com>
> Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>
> On Tue, Aug 27, 2024 at 10:10:25AM -0400, Alex Deucher wrote:
> > From: Jack Xiao <Jack.Xiao@amd.com>
> >
> > wait memory room until enough before writing mes packets to avoid ring
> > buffer overflow.
> >
> > v2: squash in sched_hw_submission fix
> >
> > Backport from 6.11.
> >
> > Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3571
> > Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command
> submission")
> > Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")
>
> These commits are in 6.11-rc1.

de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
was ported to 6.10 as well:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c?h=linux-6.10.y&id=e356d321d0240663a09b139fa3658ddbca163e27
So this fix is applicable there.

Alex

>
> > Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
> > Acked-by: Alex Deucher <alexander.deucher@amd.com>
> > Signed-off-by: Alex Deucher <alexander.deucher@amd.com> (cherry picked
> > from commit 34e087e8920e635c62e2ed6a758b0cd27f836d13)
> > Cc: stable@vger.kernel.org # 6.10.x
>
> So why does this need to go to 6.10.y?
>
> confused,
>
> greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
  2024-08-27 15:01   ` Deucher, Alexander
@ 2024-08-27 16:13     ` Greg KH
  0 siblings, 0 replies; 14+ messages in thread
From: Greg KH @ 2024-08-27 16:13 UTC (permalink / raw)
  To: Deucher, Alexander; +Cc: stable@vger.kernel.org, sashal@kernel.org, Xiao, Jack

On Tue, Aug 27, 2024 at 03:01:54PM +0000, Deucher, Alexander wrote:
> [Public]
> 
> > -----Original Message-----
> > From: Greg KH <gregkh@linuxfoundation.org>
> > Sent: Tuesday, August 27, 2024 10:21 AM
> > To: Deucher, Alexander <Alexander.Deucher@amd.com>
> > Cc: stable@vger.kernel.org; sashal@kernel.org; Xiao, Jack
> > <Jack.Xiao@amd.com>
> > Subject: Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
> >
> > On Tue, Aug 27, 2024 at 10:10:25AM -0400, Alex Deucher wrote:
> > > From: Jack Xiao <Jack.Xiao@amd.com>
> > >
> > > wait memory room until enough before writing mes packets to avoid ring
> > > buffer overflow.
> > >
> > > v2: squash in sched_hw_submission fix
> > >
> > > Backport from 6.11.
> > >
> > > Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3571
> > > Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command
> > submission")
> > > Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")
> >
> > These commits are in 6.11-rc1.
> 
> de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
> was ported to 6.10 as well:
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c?h=linux-6.10.y&id=e356d321d0240663a09b139fa3658ddbca163e27
> So this fix is applicable there.

No, commit e356d321d024 ("drm/amdgpu: cleanup MES11 command submission")
is in the 6.10 release, but commit de3246254156 ("drm/amdgpu: cleanup
MES11 command submission") is in 6.11-rc1!

So how in the world are we supposed to know anything here?

See how broken this all is?

I give up.

If you all want any AMD patches applied to stable trees, manually send
us a set of backported patches, AND be sure to get the git ids right.

I'll leave what I have right now in the queues, but after this round of
-rc releases, all AMD patches with cc: stable are going to be
automatically dropped and ignored.  I NEED you all to manually send them
to me now as this is just insane.

Time to go buy a Intel gpu card as there's no way this is going to work
out well over time...

{sigh}

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2024-08-27 16:13 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-27 14:10 [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow Alex Deucher
2024-08-27 14:21 ` Greg KH
2024-08-27 15:01   ` Deucher, Alexander
2024-08-27 16:13     ` Greg KH
  -- strict thread matches above, loose matches on Subject: below --
2024-07-19  9:16 Jack Xiao
2024-07-19 13:56 ` Alex Deucher
2024-07-19 15:44 ` Christian König
2024-07-22  3:27   ` Xiao, Jack
2024-07-22  8:20     ` Christian König
2024-07-22  8:46       ` Xiao, Jack
2024-07-22 11:20         ` Christian König
2024-07-22 19:52           ` Alex Deucher
2024-07-23  3:08             ` Xiao, Jack
2024-07-23  8:15               ` Xiao, Jack

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.