[PATCH] drm/amdgpu: optimize the padding with hw optimization

AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] drm/amdgpu: optimize the padding with hw optimization
@ 2024-07-30 12:43 Sunil Khatri
  2024-07-30 13:17 ` Christian König
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Sunil Khatri @ 2024-07-30 12:43 UTC (permalink / raw)
  To: Alex Deucher, Christian König
  Cc: amd-gfx, Sunil Khatri, Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin,
	Marek Olšák

Adding NOP packets one by one in the ring
does not use the CP efficiently.

Solution:
Use the CP optimization when adding NOP packets, so that the PFP
can discard the NOP packets based on the count in the
header instead of fetching all NOP packets
one by one.

Cc: Christian König <christian.koenig@amd.com>
Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Cc: Tvrtko Ursulin <tursulin@igalia.com>
Cc: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 853084a2ce7f..edf5b5c4d185 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
 }
 
+static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
+{
+	int i;
+
+	/* Header itself is a NOP packet */
+	if (num_nop == 1) {
+		amdgpu_ring_write(ring, ring->funcs->nop);
+		return;
+	}
+
+	/* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/
+	amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));
+
+	/* Header is at index 0, followed by num_nops - 1 NOP packet's */
+	for (i = 1; i < num_nop; i++)
+		amdgpu_ring_write(ring, ring->funcs->nop);
+}
+
 static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 	.emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v10_0_ring_test_ring,
 	.test_ib = gfx_v10_0_ring_test_ib,
-	.insert_nop = amdgpu_ring_insert_nop,
+	.insert_nop = amdgpu_gfx10_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_switch_buffer = gfx_v10_0_ring_emit_sb,
 	.emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
@@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 	.emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v10_0_ring_test_ring,
 	.test_ib = gfx_v10_0_ring_test_ib,
-	.insert_nop = amdgpu_ring_insert_nop,
+	.insert_nop = amdgpu_gfx10_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_wreg = gfx_v10_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
@@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
 	.emit_fence = gfx_v10_0_ring_emit_fence_kiq,
 	.test_ring = gfx_v10_0_ring_test_ring,
 	.test_ib = gfx_v10_0_ring_test_ib,
-	.insert_nop = amdgpu_ring_insert_nop,
+	.insert_nop = amdgpu_gfx10_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_rreg = gfx_v10_0_ring_emit_rreg,
 	.emit_wreg = gfx_v10_0_ring_emit_wreg,
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-07-30 12:43 [PATCH] drm/amdgpu: optimize the padding with hw optimization Sunil Khatri
@ 2024-07-30 13:17 ` Christian König
  2024-08-01  3:19 ` Marek Olšák
  2024-08-01  3:24 ` Marek Olšák
  2 siblings, 0 replies; 16+ messages in thread
From: Christian König @ 2024-07-30 13:17 UTC (permalink / raw)
  To: Sunil Khatri, Alex Deucher
  Cc: amd-gfx, Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin,
	Marek Olšák

Am 30.07.24 um 14:43 schrieb Sunil Khatri:
> Adding NOP packets one by one in the ring
> does not use the CP efficiently.
>
> Solution:
> Use CP optimization while adding NOP packet's so PFP
> can discard NOP packets based on information of count
> from the Header instead of fetching all NOP packets
> one by one.
>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
> Cc: Tvrtko Ursulin <tursulin@igalia.com>
> Cc: Marek Olšák <marek.olsak@amd.com>
> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
>   1 file changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 853084a2ce7f..edf5b5c4d185 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
>   	amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>   }
>   
> +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
> +{
> +	int i;
> +
> +	/* Header itself is a NOP packet */
> +	if (num_nop == 1) {
> +		amdgpu_ring_write(ring, ring->funcs->nop);
> +		return;
> +	}
> +
> +	/* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));
> +
> +	/* Header is at index 0, followed by num_nops - 1 NOP packet's */
> +	for (i = 1; i < num_nop; i++)
> +		amdgpu_ring_write(ring, ring->funcs->nop);
> +}
> +
>   static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   	.emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>   	.test_ring = gfx_v10_0_ring_test_ring,
>   	.test_ib = gfx_v10_0_ring_test_ib,
> -	.insert_nop = amdgpu_ring_insert_nop,
> +	.insert_nop = amdgpu_gfx10_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.emit_switch_buffer = gfx_v10_0_ring_emit_sb,
>   	.emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
> @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   	.emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>   	.test_ring = gfx_v10_0_ring_test_ring,
>   	.test_ib = gfx_v10_0_ring_test_ib,
> -	.insert_nop = amdgpu_ring_insert_nop,
> +	.insert_nop = amdgpu_gfx10_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
>   	.emit_fence = gfx_v10_0_ring_emit_fence_kiq,
>   	.test_ring = gfx_v10_0_ring_test_ring,
>   	.test_ib = gfx_v10_0_ring_test_ib,
> -	.insert_nop = amdgpu_ring_insert_nop,
> +	.insert_nop = amdgpu_gfx10_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v10_0_ring_emit_wreg,


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-07-30 12:43 [PATCH] drm/amdgpu: optimize the padding with hw optimization Sunil Khatri
  2024-07-30 13:17 ` Christian König
@ 2024-08-01  3:19 ` Marek Olšák
  2024-08-01  3:22   ` Marek Olšák
  2024-08-01  4:27   ` Khatri, Sunil
  2024-08-01  3:24 ` Marek Olšák
  2 siblings, 2 replies; 16+ messages in thread
From: Marek Olšák @ 2024-08-01  3:19 UTC (permalink / raw)
  To: Sunil Khatri
  Cc: Alex Deucher, Christian König, amd-gfx,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák

On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>
> Adding NOP packets one by one in the ring
> does not use the CP efficiently.
>
> Solution:
> Use CP optimization while adding NOP packet's so PFP
> can discard NOP packets based on information of count
> from the Header instead of fetching all NOP packets
> one by one.
>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
> Cc: Tvrtko Ursulin <tursulin@igalia.com>
> Cc: Marek Olšák <marek.olsak@amd.com>
> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 853084a2ce7f..edf5b5c4d185 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
>         amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>  }
>
> +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
> +{
> +       int i;
> +
> +       /* Header itself is a NOP packet */
> +       if (num_nop == 1) {
> +               amdgpu_ring_write(ring, ring->funcs->nop);
> +               return;
> +       }
> +
> +       /* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/
> +       amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));
> +
> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's */
> +       for (i = 1; i < num_nop; i++)
> +               amdgpu_ring_write(ring, ring->funcs->nop);

This loop should be removed. It's unnecessary CPU overhead and we
should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
whole packet body uninitialized is the fastest option.

Marek

> +}
> +
>  static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
>  {
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>         .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>         .test_ring = gfx_v10_0_ring_test_ring,
>         .test_ib = gfx_v10_0_ring_test_ib,
> -       .insert_nop = amdgpu_ring_insert_nop,
> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
>         .emit_switch_buffer = gfx_v10_0_ring_emit_sb,
>         .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
> @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>         .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>         .test_ring = gfx_v10_0_ring_test_ring,
>         .test_ib = gfx_v10_0_ring_test_ib,
> -       .insert_nop = amdgpu_ring_insert_nop,
> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
>         .emit_wreg = gfx_v10_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
>         .emit_fence = gfx_v10_0_ring_emit_fence_kiq,
>         .test_ring = gfx_v10_0_ring_test_ring,
>         .test_ib = gfx_v10_0_ring_test_ib,
> -       .insert_nop = amdgpu_ring_insert_nop,
> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
>         .emit_rreg = gfx_v10_0_ring_emit_rreg,
>         .emit_wreg = gfx_v10_0_ring_emit_wreg,
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-01  3:19 ` Marek Olšák
@ 2024-08-01  3:22   ` Marek Olšák
  2024-08-01  4:32     ` Khatri, Sunil
  2024-08-01  4:27   ` Khatri, Sunil
  1 sibling, 1 reply; 16+ messages in thread
From: Marek Olšák @ 2024-08-01  3:22 UTC (permalink / raw)
  To: Sunil Khatri
  Cc: Alex Deucher, Christian König, amd-gfx,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák

On Wed, Jul 31, 2024 at 11:19 PM Marek Olšák <maraeo@gmail.com> wrote:
>
> On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
> >
> > Adding NOP packets one by one in the ring
> > does not use the CP efficiently.
> >
> > Solution:
> > Use CP optimization while adding NOP packet's so PFP
> > can discard NOP packets based on information of count
> > from the Header instead of fetching all NOP packets
> > one by one.
> >
> > Cc: Christian König <christian.koenig@amd.com>
> > Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
> > Cc: Tvrtko Ursulin <tursulin@igalia.com>
> > Cc: Marek Olšák <marek.olsak@amd.com>
> > Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
> >  1 file changed, 21 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > index 853084a2ce7f..edf5b5c4d185 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
> >         amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
> >  }
> >
> > +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
> > +{
> > +       int i;
> > +
> > +       /* Header itself is a NOP packet */
> > +       if (num_nop == 1) {
> > +               amdgpu_ring_write(ring, ring->funcs->nop);
> > +               return;
> > +       }
> > +
> > +       /* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/
> > +       amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));
> > +
> > +       /* Header is at index 0, followed by num_nops - 1 NOP packet's */
> > +       for (i = 1; i < num_nop; i++)
> > +               amdgpu_ring_write(ring, ring->funcs->nop);
>
> This loop should be removed. It's unnecessary CPU overhead and we
> should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
> whole packet body uninitialized is the fastest option.

If you remove amdgpu_ring_write, you still need to move wptr somehow.
amdgpu_ring_write_multiple gives a hint about how to do it:

ring->wptr += count_dw;
ring->wptr &= ring->ptr_mask;
ring->count_dw -= count_dw;

Marek

>
> Marek
>
> > +}
> > +
> >  static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
> >  {
> >         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> > @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> >         .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
> >         .test_ring = gfx_v10_0_ring_test_ring,
> >         .test_ib = gfx_v10_0_ring_test_ib,
> > -       .insert_nop = amdgpu_ring_insert_nop,
> > +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >         .pad_ib = amdgpu_ring_generic_pad_ib,
> >         .emit_switch_buffer = gfx_v10_0_ring_emit_sb,
> >         .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
> > @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> >         .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
> >         .test_ring = gfx_v10_0_ring_test_ring,
> >         .test_ib = gfx_v10_0_ring_test_ib,
> > -       .insert_nop = amdgpu_ring_insert_nop,
> > +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >         .pad_ib = amdgpu_ring_generic_pad_ib,
> >         .emit_wreg = gfx_v10_0_ring_emit_wreg,
> >         .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> > @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
> >         .emit_fence = gfx_v10_0_ring_emit_fence_kiq,
> >         .test_ring = gfx_v10_0_ring_test_ring,
> >         .test_ib = gfx_v10_0_ring_test_ib,
> > -       .insert_nop = amdgpu_ring_insert_nop,
> > +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >         .pad_ib = amdgpu_ring_generic_pad_ib,
> >         .emit_rreg = gfx_v10_0_ring_emit_rreg,
> >         .emit_wreg = gfx_v10_0_ring_emit_wreg,
> > --
> > 2.34.1
> >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-07-30 12:43 [PATCH] drm/amdgpu: optimize the padding with hw optimization Sunil Khatri
  2024-07-30 13:17 ` Christian König
  2024-08-01  3:19 ` Marek Olšák
@ 2024-08-01  3:24 ` Marek Olšák
  2024-08-01  4:34   ` Khatri, Sunil
  2 siblings, 1 reply; 16+ messages in thread
From: Marek Olšák @ 2024-08-01  3:24 UTC (permalink / raw)
  To: Sunil Khatri
  Cc: Alex Deucher, Christian König, amd-gfx,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák

On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>
> Adding NOP packets one by one in the ring
> does not use the CP efficiently.
>
> Solution:
> Use CP optimization while adding NOP packet's so PFP
> can discard NOP packets based on information of count
> from the Header instead of fetching all NOP packets
> one by one.
>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
> Cc: Tvrtko Ursulin <tursulin@igalia.com>
> Cc: Marek Olšák <marek.olsak@amd.com>
> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 853084a2ce7f..edf5b5c4d185 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
>         amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>  }
>
> +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)

It would be better to have this function in common code instead of
duplicating it in 5 files.

Marek

> +{
> +       int i;
> +
> +       /* Header itself is a NOP packet */
> +       if (num_nop == 1) {
> +               amdgpu_ring_write(ring, ring->funcs->nop);
> +               return;
> +       }
> +
> +       /* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/
> +       amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));
> +
> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's */
> +       for (i = 1; i < num_nop; i++)
> +               amdgpu_ring_write(ring, ring->funcs->nop);
> +}
> +
>  static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
>  {
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>         .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>         .test_ring = gfx_v10_0_ring_test_ring,
>         .test_ib = gfx_v10_0_ring_test_ib,
> -       .insert_nop = amdgpu_ring_insert_nop,
> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
>         .emit_switch_buffer = gfx_v10_0_ring_emit_sb,
>         .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
> @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>         .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>         .test_ring = gfx_v10_0_ring_test_ring,
>         .test_ib = gfx_v10_0_ring_test_ib,
> -       .insert_nop = amdgpu_ring_insert_nop,
> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
>         .emit_wreg = gfx_v10_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
>         .emit_fence = gfx_v10_0_ring_emit_fence_kiq,
>         .test_ring = gfx_v10_0_ring_test_ring,
>         .test_ib = gfx_v10_0_ring_test_ib,
> -       .insert_nop = amdgpu_ring_insert_nop,
> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
>         .emit_rreg = gfx_v10_0_ring_emit_rreg,
>         .emit_wreg = gfx_v10_0_ring_emit_wreg,
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-01  3:19 ` Marek Olšák
  2024-08-01  3:22   ` Marek Olšák
@ 2024-08-01  4:27   ` Khatri, Sunil
  2024-08-01  6:53     ` Marek Olšák
  1 sibling, 1 reply; 16+ messages in thread
From: Khatri, Sunil @ 2024-08-01  4:27 UTC (permalink / raw)
  To: Marek Olšák, Sunil Khatri
  Cc: Alex Deucher, Christian König, amd-gfx,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák


On 8/1/2024 8:49 AM, Marek Olšák wrote:
> On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>> Adding NOP packets one by one in the ring
>> does not use the CP efficiently.
>>
>> Solution:
>> Use CP optimization while adding NOP packet's so PFP
>> can discard NOP packets based on information of count
>> from the Header instead of fetching all NOP packets
>> one by one.
>>
>> Cc: Christian König <christian.koenig@amd.com>
>> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
>> Cc: Tvrtko Ursulin <tursulin@igalia.com>
>> Cc: Marek Olšák <marek.olsak@amd.com>
>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
>>   1 file changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 853084a2ce7f..edf5b5c4d185 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
>>          amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>>   }
>>
>> +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
>> +{
>> +       int i;
>> +
>> +       /* Header itself is a NOP packet */
>> +       if (num_nop == 1) {
>> +               amdgpu_ring_write(ring, ring->funcs->nop);
>> +               return;
>> +       }
>> +
>> +       /* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/
>> +       amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));
>> +
>> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's */
>> +       for (i = 1; i < num_nop; i++)
>> +               amdgpu_ring_write(ring, ring->funcs->nop);
> This loop should be removed. It's unnecessary CPU overhead and we
> should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
> whole packet body uninitialized is the fastest option.
The original intent was to just move the WPTR by the number of NOPs, and
I tried that too. Based on Christian's input, as a safety measure we should
not leave the NOP packet body as garbage (or whatever was in the ring
originally).
So there is no optimization on the CPU side, but the CP will skip all of
these dwords, so the GPU side should see the benefit.

Regards
Sunil Khatri

>
> Marek
>
>> +}
>> +
>>   static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
>>   {
>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>>          .test_ring = gfx_v10_0_ring_test_ring,
>>          .test_ib = gfx_v10_0_ring_test_ib,
>> -       .insert_nop = amdgpu_ring_insert_nop,
>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>          .emit_switch_buffer = gfx_v10_0_ring_emit_sb,
>>          .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
>> @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>>          .test_ring = gfx_v10_0_ring_test_ring,
>>          .test_ib = gfx_v10_0_ring_test_ib,
>> -       .insert_nop = amdgpu_ring_insert_nop,
>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>          .emit_wreg = gfx_v10_0_ring_emit_wreg,
>>          .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>> @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
>>          .emit_fence = gfx_v10_0_ring_emit_fence_kiq,
>>          .test_ring = gfx_v10_0_ring_test_ring,
>>          .test_ib = gfx_v10_0_ring_test_ib,
>> -       .insert_nop = amdgpu_ring_insert_nop,
>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>          .emit_rreg = gfx_v10_0_ring_emit_rreg,
>>          .emit_wreg = gfx_v10_0_ring_emit_wreg,
>> --
>> 2.34.1
>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-01  3:22   ` Marek Olšák
@ 2024-08-01  4:32     ` Khatri, Sunil
  0 siblings, 0 replies; 16+ messages in thread
From: Khatri, Sunil @ 2024-08-01  4:32 UTC (permalink / raw)
  To: Marek Olšák, Sunil Khatri
  Cc: Alex Deucher, Christian König, amd-gfx,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák


On 8/1/2024 8:52 AM, Marek Olšák wrote:
> On Wed, Jul 31, 2024 at 11:19 PM Marek Olšák <maraeo@gmail.com> wrote:
>> On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>>> Adding NOP packets one by one in the ring
>>> does not use the CP efficiently.
>>>
>>> Solution:
>>> Use CP optimization while adding NOP packet's so PFP
>>> can discard NOP packets based on information of count
>>> from the Header instead of fetching all NOP packets
>>> one by one.
>>>
>>> Cc: Christian König <christian.koenig@amd.com>
>>> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
>>> Cc: Tvrtko Ursulin <tursulin@igalia.com>
>>> Cc: Marek Olšák <marek.olsak@amd.com>
>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
>>>   1 file changed, 21 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index 853084a2ce7f..edf5b5c4d185 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
>>>          amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>>>   }
>>>
>>> +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
>>> +{
>>> +       int i;
>>> +
>>> +       /* Header itself is a NOP packet */
>>> +       if (num_nop == 1) {
>>> +               amdgpu_ring_write(ring, ring->funcs->nop);
>>> +               return;
>>> +       }
>>> +
>>> +       /* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/
>>> +       amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));
>>> +
>>> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's */
>>> +       for (i = 1; i < num_nop; i++)
>>> +               amdgpu_ring_write(ring, ring->funcs->nop);
>> This loop should be removed. It's unnecessary CPU overhead and we
>> should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
>> whole packet body uninitialized is the fastest option.
> If you remove amdgpu_ring_write, you still need to move wptr somehow.
> amdgpu_ring_write_multiple gives a hint about how to do it:
>
> ring->wptr += count_dw;
> ring->wptr &= ring->ptr_mask;
> ring->count_dw -= count_dw;
I gave the reason in my previous reply for why we did not just move the
wptr instead of writing all the NOPs into the ring. I also tried exactly
what is given above, just moving the wptr, but the device did not boot.

Possibly the calculation was wrong, or on some targets the NOP isn't working
as expected. With this approach, if the NOP works as per spec it helps
save GPU cycles, and if it does not, it still won't crash,

because the NOPs are still there in the ring.
I did not spend more time analysing the crash caused by just shifting
the wptr, for the reason explained in my previous reply. The original
understanding was to just move the wptr, but based on Christian's feedback
we are still filling the ring with NOPs.

>
> Marek
>
>> Marek
>>
>>> +}
>>> +
>>>   static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
>>>   {
>>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>>>          .test_ring = gfx_v10_0_ring_test_ring,
>>>          .test_ib = gfx_v10_0_ring_test_ib,
>>> -       .insert_nop = amdgpu_ring_insert_nop,
>>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>>          .emit_switch_buffer = gfx_v10_0_ring_emit_sb,
>>>          .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
>>> @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>>>          .test_ring = gfx_v10_0_ring_test_ring,
>>>          .test_ib = gfx_v10_0_ring_test_ib,
>>> -       .insert_nop = amdgpu_ring_insert_nop,
>>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>>          .emit_wreg = gfx_v10_0_ring_emit_wreg,
>>>          .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>> @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
>>>          .emit_fence = gfx_v10_0_ring_emit_fence_kiq,
>>>          .test_ring = gfx_v10_0_ring_test_ring,
>>>          .test_ib = gfx_v10_0_ring_test_ib,
>>> -       .insert_nop = amdgpu_ring_insert_nop,
>>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>>          .emit_rreg = gfx_v10_0_ring_emit_rreg,
>>>          .emit_wreg = gfx_v10_0_ring_emit_wreg,
>>> --
>>> 2.34.1
>>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-01  3:24 ` Marek Olšák
@ 2024-08-01  4:34   ` Khatri, Sunil
  0 siblings, 0 replies; 16+ messages in thread
From: Khatri, Sunil @ 2024-08-01  4:34 UTC (permalink / raw)
  To: Marek Olšák, Sunil Khatri
  Cc: Alex Deucher, Christian König, amd-gfx,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák


On 8/1/2024 8:54 AM, Marek Olšák wrote:
> On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>> Adding NOP packets one by one in the ring
>> does not use the CP efficiently.
>>
>> Solution:
>> Use CP optimization while adding NOP packet's so PFP
>> can discard NOP packets based on information of count
>> from the Header instead of fetching all NOP packets
>> one by one.
>>
>> Cc: Christian König <christian.koenig@amd.com>
>> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
>> Cc: Tvrtko Ursulin <tursulin@igalia.com>
>> Cc: Marek Olšák <marek.olsak@amd.com>
>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
>>   1 file changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 853084a2ce7f..edf5b5c4d185 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
>>          amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>>   }
>>
>> +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
> It would be better to have this function in common code instead of
> duplicating it in 5 files.
Although the code here looks the same, it is architecture dependent. Keeping 
the code in a common file would also require redefining PACKET3, PACKET3_NOP, 
etc. in common code; they are currently defined in soc15.h.
So it was suggested not to move architecture-specific changes to common 
code.
>
> Marek
>
>> +{
>> +       int i;
>> +
>> +       /* Header itself is a NOP packet */
>> +       if (num_nop == 1) {
>> +               amdgpu_ring_write(ring, ring->funcs->nop);
>> +               return;
>> +       }
>> +
>> +       /* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/
>> +       amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));
>> +
>> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's */
>> +       for (i = 1; i < num_nop; i++)
>> +               amdgpu_ring_write(ring, ring->funcs->nop);
>> +}
>> +
>>   static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
>>   {
>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>>          .test_ring = gfx_v10_0_ring_test_ring,
>>          .test_ib = gfx_v10_0_ring_test_ib,
>> -       .insert_nop = amdgpu_ring_insert_nop,
>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>          .emit_switch_buffer = gfx_v10_0_ring_emit_sb,
>>          .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
>> @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
>>          .test_ring = gfx_v10_0_ring_test_ring,
>>          .test_ib = gfx_v10_0_ring_test_ib,
>> -       .insert_nop = amdgpu_ring_insert_nop,
>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>          .emit_wreg = gfx_v10_0_ring_emit_wreg,
>>          .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>> @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
>>          .emit_fence = gfx_v10_0_ring_emit_fence_kiq,
>>          .test_ring = gfx_v10_0_ring_test_ring,
>>          .test_ib = gfx_v10_0_ring_test_ib,
>> -       .insert_nop = amdgpu_ring_insert_nop,
>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>          .emit_rreg = gfx_v10_0_ring_emit_rreg,
>>          .emit_wreg = gfx_v10_0_ring_emit_wreg,
>> --
>> 2.34.1
>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-01  4:27   ` Khatri, Sunil
@ 2024-08-01  6:53     ` Marek Olšák
  2024-08-01  7:37       ` Christian König
  0 siblings, 1 reply; 16+ messages in thread
From: Marek Olšák @ 2024-08-01  6:53 UTC (permalink / raw)
  To: Khatri, Sunil
  Cc: Sunil Khatri, Alex Deucher, Christian König,
	amd-gfx mailing list, Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin,
	Marek Olšák

[-- Attachment #1: Type: text/plain, Size: 4767 bytes --]

On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukhatri@amd.com> wrote:

>
> On 8/1/2024 8:49 AM, Marek Olšák wrote:
> > On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri@amd.com>
> wrote:
> >> Adding NOP packets one by one in the ring
> >> does not use the CP efficiently.
> >>
> >> Solution:
> >> Use CP optimization while adding NOP packet's so PFP
> >> can discard NOP packets based on information of count
> >> from the Header instead of fetching all NOP packets
> >> one by one.
> >>
> >> Cc: Christian König <christian.koenig@amd.com>
> >> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
> >> Cc: Tvrtko Ursulin <tursulin@igalia.com>
> >> Cc: Marek Olšák <marek.olsak@amd.com>
> >> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> >> ---
> >>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
> >>   1 file changed, 21 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> >> index 853084a2ce7f..edf5b5c4d185 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> >> @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct
> amdgpu_ring *ring)
> >>          amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
> >>   }
> >>
> >> +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring,
> uint32_t num_nop)
> >> +{
> >> +       int i;
> >> +
> >> +       /* Header itself is a NOP packet */
> >> +       if (num_nop == 1) {
> >> +               amdgpu_ring_write(ring, ring->funcs->nop);
> >> +               return;
> >> +       }
> >> +
> >> +       /* Max HW optimization till 0x3ffe, followed by remaining one
> NOP at a time*/
> >> +       amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2,
> 0x3ffe)));
> >> +
> >> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's
> */
> >> +       for (i = 1; i < num_nop; i++)
> >> +               amdgpu_ring_write(ring, ring->funcs->nop);
> > This loop should be removed. It's unnecessary CPU overhead and we
> > should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
> > whole packet body uninitialized is the fastest option.
> That was the original intent to just move the WPTR for the no of nops
> and tried too. Based on Christian inputs we should not let the nops packet
>
> as garbage or whatever was there originally as a threat/safety measure.


It doesn't help safety. It can only be read by the GPU with kernel-level
permissions.

Initializing the packet body is useless and adds CPU overhead, especially
with the 256 NOPs or so that we use for no reason.

Marek


> So CPU side there isnt any optimization but just CP will skip all these
> so GPU side should see the benefit.
>
> Regards
> Sunil Khatri
>
> >
> > Marek
> >
> >> +}
> >> +
> >>   static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
> >>   {
> >>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> >> @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs
> gfx_v10_0_ring_funcs_gfx = {
> >>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
> >>          .test_ring = gfx_v10_0_ring_test_ring,
> >>          .test_ib = gfx_v10_0_ring_test_ib,
> >> -       .insert_nop = amdgpu_ring_insert_nop,
> >> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >>          .pad_ib = amdgpu_ring_generic_pad_ib,
> >>          .emit_switch_buffer = gfx_v10_0_ring_emit_sb,
> >>          .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
> >> @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs
> gfx_v10_0_ring_funcs_compute = {
> >>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
> >>          .test_ring = gfx_v10_0_ring_test_ring,
> >>          .test_ib = gfx_v10_0_ring_test_ib,
> >> -       .insert_nop = amdgpu_ring_insert_nop,
> >> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >>          .pad_ib = amdgpu_ring_generic_pad_ib,
> >>          .emit_wreg = gfx_v10_0_ring_emit_wreg,
> >>          .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> >> @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs
> gfx_v10_0_ring_funcs_kiq = {
> >>          .emit_fence = gfx_v10_0_ring_emit_fence_kiq,
> >>          .test_ring = gfx_v10_0_ring_test_ring,
> >>          .test_ib = gfx_v10_0_ring_test_ib,
> >> -       .insert_nop = amdgpu_ring_insert_nop,
> >> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >>          .pad_ib = amdgpu_ring_generic_pad_ib,
> >>          .emit_rreg = gfx_v10_0_ring_emit_rreg,
> >>          .emit_wreg = gfx_v10_0_ring_emit_wreg,
> >> --
> >> 2.34.1
> >>
>

[-- Attachment #2: Type: text/html, Size: 6722 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-01  6:53     ` Marek Olšák
@ 2024-08-01  7:37       ` Christian König
  2024-08-01 18:55         ` Marek Olšák
  0 siblings, 1 reply; 16+ messages in thread
From: Christian König @ 2024-08-01  7:37 UTC (permalink / raw)
  To: Marek Olšák, Khatri, Sunil
  Cc: Sunil Khatri, Alex Deucher, amd-gfx mailing list,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák

[-- Attachment #1: Type: text/plain, Size: 1375 bytes --]

Am 01.08.24 um 08:53 schrieb Marek Olšák:
> On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukhatri@amd.com> wrote:
>
>
>     On 8/1/2024 8:49 AM, Marek Olšák wrote:
>     >> +       /* Header is at index 0, followed by num_nops - 1 NOP
>     packet's */
>     >> +       for (i = 1; i < num_nop; i++)
>     >> +               amdgpu_ring_write(ring, ring->funcs->nop);
>     > This loop should be removed. It's unnecessary CPU overhead and we
>     > should never get more than 0x3fff NOPs (maybe use BUG_ON).
>     Leaving the
>     > whole packet body uninitialized is the fastest option.
>     That was the original intent to just move the WPTR for the no of nops
>     and tried too. Based on Christian inputs we should not let the
>     nops packet
>
>     as garbage or whatever was there originally as a threat/safety
>     measure.
>
>
> It doesn't help safety. It can only be read by the GPU with 
> kernel-level permissions.
>
> Initializing the packet body is useless and adds CPU overhead, 
> especially with the 256 NOPs or so that we use for no reason.

Not filling the remaining ring buffers with NOPs is a pretty clear NAK 
from my side. Leaving garbage in the ring buffer is not even remotely 
defensive.

What we can do is to optimize filling N DWs into the ring buffer without 
updating the WPTR each time.

Regards,
Christian.

>
> Marek
>

[-- Attachment #2: Type: text/html, Size: 2886 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-01  7:37       ` Christian König
@ 2024-08-01 18:55         ` Marek Olšák
  2024-08-02 10:10           ` Lazar, Lijo
  2024-08-04 18:11           ` Marek Olšák
  0 siblings, 2 replies; 16+ messages in thread
From: Marek Olšák @ 2024-08-01 18:55 UTC (permalink / raw)
  To: Christian König
  Cc: Khatri, Sunil, Sunil Khatri, Alex Deucher, amd-gfx mailing list,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák

[-- Attachment #1: Type: text/plain, Size: 1551 bytes --]

On Thu, Aug 1, 2024, 03:37 Christian König <christian.koenig@amd.com> wrote:

> Am 01.08.24 um 08:53 schrieb Marek Olšák:
>
> On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukhatri@amd.com> wrote:
>
>>
>> On 8/1/2024 8:49 AM, Marek Olšák wrote:
>> >> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's
>> */
>> >> +       for (i = 1; i < num_nop; i++)
>> >> +               amdgpu_ring_write(ring, ring->funcs->nop);
>> > This loop should be removed. It's unnecessary CPU overhead and we
>> > should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
>> > whole packet body uninitialized is the fastest option.
>> That was the original intent to just move the WPTR for the no of nops
>> and tried too. Based on Christian inputs we should not let the nops packet
>>
>> as garbage or whatever was there originally as a threat/safety measure.
>
>
> It doesn't help safety. It can only be read by the GPU with kernel-level
> permissions.
>
> Initializing the packet body is useless and adds CPU overhead, especially
> with the 256 NOPs or so that we use for no reason.
>
>
> Not filling the remaining ring buffers with NOPs is a pretty clear NAK
> from my side. Leaving garbage in the ring buffer is not even remotely
> defensive.
>

What are you defending against? You know the ring is kernel-owned memory,
right?

Marek


> What we can do is to optimize filling N DWs into the ring buffer without
> updating the WPTR each time.
>
> Regards,
> Christian.
>
>
> Marek
>
>

[-- Attachment #2: Type: text/html, Size: 3365 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-01 18:55         ` Marek Olšák
@ 2024-08-02 10:10           ` Lazar, Lijo
  2024-08-04  5:28             ` Marek Olšák
  2024-08-04 18:11           ` Marek Olšák
  1 sibling, 1 reply; 16+ messages in thread
From: Lazar, Lijo @ 2024-08-02 10:10 UTC (permalink / raw)
  To: Marek Olšák, Christian König
  Cc: Khatri, Sunil, Sunil Khatri, Alex Deucher, amd-gfx mailing list,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák



On 8/2/2024 12:25 AM, Marek Olšák wrote:
> On Thu, Aug 1, 2024, 03:37 Christian König <christian.koenig@amd.com
> <mailto:christian.koenig@amd.com>> wrote:
> 
>     __
>     Am 01.08.24 um 08:53 schrieb Marek Olšák:
>>     On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukhatri@amd.com
>>     <mailto:sukhatri@amd.com>> wrote:
>>
>>
>>         On 8/1/2024 8:49 AM, Marek Olšák wrote:
>>         >> +       /* Header is at index 0, followed by num_nops - 1
>>         NOP packet's */
>>         >> +       for (i = 1; i < num_nop; i++)
>>         >> +               amdgpu_ring_write(ring, ring->funcs->nop);
>>         > This loop should be removed. It's unnecessary CPU overhead
>>         and we
>>         > should never get more than 0x3fff NOPs (maybe use BUG_ON).
>>         Leaving the
>>         > whole packet body uninitialized is the fastest option.
>>         That was the original intent to just move the WPTR for the no
>>         of nops
>>         and tried too. Based on Christian inputs we should not let the
>>         nops packet
>>
>>         as garbage or whatever was there originally as a threat/safety
>>         measure.
>>
>>
>>     It doesn't help safety. It can only be read by the GPU with
>>     kernel-level permissions.
>>
>>     Initializing the packet body is useless and adds CPU overhead,
>>     especially with the 256 NOPs or so that we use for no reason.
> 
>     Not filling the remaining ring buffers with NOPs is a pretty clear
>     NAK from my side. Leaving garbage in the ring buffer is not even
>     remotely defensive.
> 
> 
> What are you defending against? You know the ring is kernel-owned
> memory, right? 
> 

Aside from that, the true hardware behavior is that CP still fetches the
words and discards them. That is not the same as what the description says.
So the only optimization it allows is to move the pointer without
filling/caring about the contents, as hardware also doesn't care about
them. The notion of filling those unused regions is exactly the opposite of
the intention. If that's the case, nothing is gained and these patches
should just be dropped.

Thanks,
Lijo

> Marek
> 
> 
>     What we can do is to optimize filling N DWs into the ring buffer
>     without updating the WPTR each time.
> 
>     Regards,
>     Christian.
> 
>>
>>     Marek
>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-02 10:10           ` Lazar, Lijo
@ 2024-08-04  5:28             ` Marek Olšák
  0 siblings, 0 replies; 16+ messages in thread
From: Marek Olšák @ 2024-08-04  5:28 UTC (permalink / raw)
  To: Lazar, Lijo
  Cc: Christian König, Khatri, Sunil, Sunil Khatri, Alex Deucher,
	amd-gfx mailing list, Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin,
	Marek Olšák

On Fri, Aug 2, 2024 at 6:10 AM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>
>
>
> On 8/2/2024 12:25 AM, Marek Olšák wrote:
> > On Thu, Aug 1, 2024, 03:37 Christian König <christian.koenig@amd.com
> > <mailto:christian.koenig@amd.com>> wrote:
> >
> >     __
> >     Am 01.08.24 um 08:53 schrieb Marek Olšák:
> >>     On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukhatri@amd.com
> >>     <mailto:sukhatri@amd.com>> wrote:
> >>
> >>
> >>         On 8/1/2024 8:49 AM, Marek Olšák wrote:
> >>         >> +       /* Header is at index 0, followed by num_nops - 1
> >>         NOP packet's */
> >>         >> +       for (i = 1; i < num_nop; i++)
> >>         >> +               amdgpu_ring_write(ring, ring->funcs->nop);
> >>         > This loop should be removed. It's unnecessary CPU overhead
> >>         and we
> >>         > should never get more than 0x3fff NOPs (maybe use BUG_ON).
> >>         Leaving the
> >>         > whole packet body uninitialized is the fastest option.
> >>         That was the original intent to just move the WPTR for the no
> >>         of nops
> >>         and tried too. Based on Christian inputs we should not let the
> >>         nops packet
> >>
> >>         as garbage or whatever was there originally as a threat/safety
> >>         measure.
> >>
> >>
> >>     It doesn't help safety. It can only be read by the GPU with
> >>     kernel-level permissions.
> >>
> >>     Initializing the packet body is useless and adds CPU overhead,
> >>     especially with the 256 NOPs or so that we use for no reason.
> >
> >     Not filling the remaining ring buffers with NOPs is a pretty clear
> >     NAK from my side. Leaving garbage in the ring buffer is not even
> >     remotely defensive.
> >
> >
> > What are you defending against? You know the ring is kernel-owned
> > memory, right?
> >
>
> Aside from that, the true hardware behavior is that CP still fetches the
> words and discards them. It's not the same mentioned in the description.
> So the only optimization it allows is to move the pointer without
> filling/caring about the contents as hardware also doesn't care about
> them. The notion of filling those unused region is exactly opposite of
> the intention. If that's the case, nothing is gained and just drop these
> patches.

It's correct that it doesn't reduce fetching, but this optimization is
not about fetching. It's about reducing the number of instructions
that the firmware must execute. Single dword NOPs are quite expensive
for their size.

Marek

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-01 18:55         ` Marek Olšák
  2024-08-02 10:10           ` Lazar, Lijo
@ 2024-08-04 18:11           ` Marek Olšák
  2024-08-07  8:21             ` Tvrtko Ursulin
  1 sibling, 1 reply; 16+ messages in thread
From: Marek Olšák @ 2024-08-04 18:11 UTC (permalink / raw)
  To: Christian König
  Cc: Khatri, Sunil, Sunil Khatri, Alex Deucher, amd-gfx mailing list,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák

On Thu, Aug 1, 2024 at 2:55 PM Marek Olšák <maraeo@gmail.com> wrote:
>
> On Thu, Aug 1, 2024, 03:37 Christian König <christian.koenig@amd.com> wrote:
>>
>> Am 01.08.24 um 08:53 schrieb Marek Olšák:
>>
>> On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukhatri@amd.com> wrote:
>>>
>>>
>>> On 8/1/2024 8:49 AM, Marek Olšák wrote:
>>> >> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's */
>>> >> +       for (i = 1; i < num_nop; i++)
>>> >> +               amdgpu_ring_write(ring, ring->funcs->nop);
>>> > This loop should be removed. It's unnecessary CPU overhead and we
>>> > should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
>>> > whole packet body uninitialized is the fastest option.
>>> That was the original intent to just move the WPTR for the no of nops
>>> and tried too. Based on Christian inputs we should not let the nops packet
>>>
>>> as garbage or whatever was there originally as a threat/safety measure.
>>
>>
>> It doesn't help safety. It can only be read by the GPU with kernel-level permissions.
>>
>> Initializing the packet body is useless and adds CPU overhead, especially with the 256 NOPs or so that we use for no reason.
>>
>>
>> Not filling the remaining ring buffers with NOPs is a pretty clear NAK from my side. Leaving garbage in the ring buffer is not even remotely defensive.
>
>
> What are you defending against? You know the ring is kernel-owned memory, right?

This was pushed without any justification why you need to clear
kernel-allocated memory with some constant number up to 30000 times
per second that only the kernel can read.

Marek

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-04 18:11           ` Marek Olšák
@ 2024-08-07  8:21             ` Tvrtko Ursulin
  2024-08-07 16:42               ` Marek Olšák
  0 siblings, 1 reply; 16+ messages in thread
From: Tvrtko Ursulin @ 2024-08-07  8:21 UTC (permalink / raw)
  To: Marek Olšák, Christian König
  Cc: Khatri, Sunil, Sunil Khatri, Alex Deucher, amd-gfx mailing list,
	Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin, Marek Olšák


On 04/08/2024 19:11, Marek Olšák wrote:
> On Thu, Aug 1, 2024 at 2:55 PM Marek Olšák <maraeo@gmail.com> wrote:
>>
>> On Thu, Aug 1, 2024, 03:37 Christian König <christian.koenig@amd.com> wrote:
>>>
>>> Am 01.08.24 um 08:53 schrieb Marek Olšák:
>>>
>>> On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukhatri@amd.com> wrote:
>>>>
>>>>
>>>> On 8/1/2024 8:49 AM, Marek Olšák wrote:
>>>>>> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's */
>>>>>> +       for (i = 1; i < num_nop; i++)
>>>>>> +               amdgpu_ring_write(ring, ring->funcs->nop);
>>>>> This loop should be removed. It's unnecessary CPU overhead and we
>>>>> should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
>>>>> whole packet body uninitialized is the fastest option.
>>>> That was the original intent to just move the WPTR for the no of nops
>>>> and tried too. Based on Christian inputs we should not let the nops packet
>>>>
>>>> as garbage or whatever was there originally as a threat/safety measure.
>>>
>>>
>>> It doesn't help safety. It can only be read by the GPU with kernel-level permissions.
>>>
>>> Initializing the packet body is useless and adds CPU overhead, especially with the 256 NOPs or so that we use for no reason.
>>>
>>>
>>> Not filling the remaining ring buffers with NOPs is a pretty clear NAK from my side. Leaving garbage in the ring buffer is not even remotely defensive.
>>
>>
>> What are you defending against? You know the ring is kernel-owned memory, right?
> 
> This was pushed without any justification why you need to clear
> kernel-allocated memory with some constant number up to 30000 times
> per second that only the kernel can read.

I see that this seems to be controversial, but FWIW, if the loop ends up 
staying, at least we could replace it with memset32 as I have shown in 
https://lore.kernel.org/amd-gfx/20240715104026.6311-1-tursulin@igalia.com/ 
that the inefficient amdgpu_ring_write can show up in the profile.

And also maybe consider rings other than gfx? Again, I did something in 
https://lore.kernel.org/amd-gfx/20240712152855.45284-4-tursulin@igalia.com/, 
but AMD folks will know if there is a similar (like in this series) 
approach which also improves the GPU side processing and not just CPU side.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization
  2024-08-07  8:21             ` Tvrtko Ursulin
@ 2024-08-07 16:42               ` Marek Olšák
  0 siblings, 0 replies; 16+ messages in thread
From: Marek Olšák @ 2024-08-07 16:42 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: Christian König, Khatri, Sunil, Sunil Khatri, Alex Deucher,
	amd-gfx mailing list, Pierre-Eric Pelloux-Prayer, Tvrtko Ursulin,
	Marek Olšák

On Wed, Aug 7, 2024 at 4:21 AM Tvrtko Ursulin <tursulin@ursulin.net> wrote:
>
>
> On 04/08/2024 19:11, Marek Olšák wrote:
> > On Thu, Aug 1, 2024 at 2:55 PM Marek Olšák <maraeo@gmail.com> wrote:
> >>
> >> On Thu, Aug 1, 2024, 03:37 Christian König <christian.koenig@amd.com> wrote:
> >>>
> >>> Am 01.08.24 um 08:53 schrieb Marek Olšák:
> >>>
> >>> On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukhatri@amd.com> wrote:
> >>>>
> >>>>
> >>>> On 8/1/2024 8:49 AM, Marek Olšák wrote:
> >>>>>> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's */
> >>>>>> +       for (i = 1; i < num_nop; i++)
> >>>>>> +               amdgpu_ring_write(ring, ring->funcs->nop);
> >>>>> This loop should be removed. It's unnecessary CPU overhead and we
> >>>>> should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
> >>>>> whole packet body uninitialized is the fastest option.
> >>>> That was the original intent to just move the WPTR for the no of nops
> >>>> and tried too. Based on Christian inputs we should not let the nops packet
> >>>>
> >>>> as garbage or whatever was there originally as a threat/safety measure.
> >>>
> >>>
> >>> It doesn't help safety. It can only be read by the GPU with kernel-level permissions.
> >>>
> >>> Initializing the packet body is useless and adds CPU overhead, especially with the 256 NOPs or so that we use for no reason.
> >>>
> >>>
> >>> Not filling the remaining ring buffers with NOPs is a pretty clear NAK from my side. Leaving garbage in the ring buffer is not even remotely defensive.
> >>
> >>
> >> What are you defending against? You know the ring is kernel-owned memory, right?
> >
> > This was pushed without any justification why you need to clear
> > kernel-allocated memory with some constant number up to 30000 times
> > per second that only the kernel can read.
>
> I see that this seems to be controversial, but FWIW, if the loop ends up
> staying, at least we could replace it with memset32 as I have shown in
> https://lore.kernel.org/amd-gfx/20240715104026.6311-1-tursulin@igalia.com/
> that the inefficient amdgpu_ring_write can show up in the profile.
>
> And also maybe consider other than gfx? Again, I did something in
> https://lore.kernel.org/amd-gfx/20240712152855.45284-4-tursulin@igalia.com/,
> but AMD folks will know if there is a similar (like in this series)
> approach which also improves the GPU side processing and not just CPU side.

1. Yes, we should reduce CPU overhead by not using amdgpu_ring_write
to fill a ring buffer.

2. We should stop clearing NOP content.

3. We should stop padding to 256 dwords when we really just need to
pad to 8 dwords.

4. We should get rid of amdgpu_ring_write and use a more efficient way
to write into a ring buffer, which is described below.
amdgpu_ring_write generates bad CPU code because the compiler can't
determine pointer aliasing.

When starting to write into a ring buffer, count_dw, *ring, buf_mask,
and wptr should be copied to local variables and ring writes should
only use those. After everything has been written into the ring
buffer, the local variables should be copied back to amdgpu_ring, and
ptr_mask should be applied only then. That allows the compiler to use
constant offsets for the writes, and
reorder/merge/const-evaluate/optimize all operations on the local
variables, which doesn't happen with non-local variables. 3 macros
(amdgpu_ring_begin, amdgpu_ring_write, amdgpu_ring_end) can be created
to encapsulate this logic. Example:

amdgpu_ring_begin(ring);
amdgpu_ring_write(value0);
amdgpu_ring_write(value1);
amdgpu_ring_write(value2);
amdgpu_ring_write(value3);
amdgpu_ring_end();

Marek

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2024-08-07 16:43 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-07-30 12:43 [PATCH] drm/amdgpu: optimize the padding with hw optimization Sunil Khatri
2024-07-30 13:17 ` Christian König
2024-08-01  3:19 ` Marek Olšák
2024-08-01  3:22   ` Marek Olšák
2024-08-01  4:32     ` Khatri, Sunil
2024-08-01  4:27   ` Khatri, Sunil
2024-08-01  6:53     ` Marek Olšák
2024-08-01  7:37       ` Christian König
2024-08-01 18:55         ` Marek Olšák
2024-08-02 10:10           ` Lazar, Lijo
2024-08-04  5:28             ` Marek Olšák
2024-08-04 18:11           ` Marek Olšák
2024-08-07  8:21             ` Tvrtko Ursulin
2024-08-07 16:42               ` Marek Olšák
2024-08-01  3:24 ` Marek Olšák
2024-08-01  4:34   ` Khatri, Sunil

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox