Igt-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: "Grzegorzek, Dominik" <dominik.grzegorzek@intel.com>
To: "Kempczynski, Zbigniew" <zbigniew.kempczynski@intel.com>,
	"igt-dev@lists.freedesktop.org" <igt-dev@lists.freedesktop.org>
Subject: Re: [PATCH i-g-t 1/3] lib/rendercopy: Add render-copy xe2 implementation
Date: Mon, 8 Jan 2024 15:59:21 +0000	[thread overview]
Message-ID: <eeb22ef9c7ffe590dd94df1b5cea7ee2deddb408.camel@intel.com> (raw)
In-Reply-To: <20240108113012.382557-2-zbigniew.kempczynski@intel.com>

On Mon, 2024-01-08 at 12:30 +0100, Zbigniew Kempczyński wrote:
> Due to small differences between xe2 and previous 3d pipeline I decided
> to adopt this in gen9 render-copy function instead of introducing a new one.
> Xe2 uses large GRFs (512-bit) where coordinates occupy only 2 GRF
> registers (instead 4 on 256-bit GRFs). This requires shader adoption
> on data preparation for sampler/render target write.
> 
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski@intel.com>
> Cc: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
> ---
>  lib/genxe2_render.h                       | 14 ++++++
> 
I believe we should not use the 'gen' notation anymore. Either prefix these with the first
platform that uses them (xe2lpg_ in this case) or simply with the architecture name (xe2_). This
applies to all 'GENXE2'/'genxe2' prefixes.

>  lib/i915/shaders/ps/gen20_render_copy.asm |  8 +++
>  lib/rendercopy.h                          |  4 ++
>  lib/rendercopy_gen9.c                     | 60 +++++++++++++++++++++--
>  4 files changed, 81 insertions(+), 5 deletions(-)
>  create mode 100644 lib/genxe2_render.h
>  create mode 100644 lib/i915/shaders/ps/gen20_render_copy.asm
> 
> diff --git a/lib/genxe2_render.h b/lib/genxe2_render.h
> new file mode 100644
> index 0000000000..3db7a84894
> --- /dev/null
> +++ b/lib/genxe2_render.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +#ifndef GENXE2_RENDER_H
> +#define GENXE2_RENDER_H
> +
> +#define GENXE2_3DSTATE_DRAWING_RECTANGLE_FAST	GEN4_3D(3, 0, 0)
> +
> +/* 3DSTATE_PS dword6 */
> +# define GENXE_KERNEL0_PACKING_POLICY		24
> +#  define GENXE_KERNEL0_POLY_PACK16_FIXED	3
> +
> +#endif
> diff --git a/lib/i915/shaders/ps/gen20_render_copy.asm b/lib/i915/shaders/ps/gen20_render_copy.asm
> new file mode 100644
> index 0000000000..330417966d
> --- /dev/null
> +++ b/lib/i915/shaders/ps/gen20_render_copy.asm
> @@ -0,0 +1,8 @@
> +L0:
> +(W)     mad (16|M0)              acc0.0<1>:f   r6.3<0;0>:f      r1.0<1;1>:f       r6.0<0>:f
> +(W)     mad (16|M0)              r113.0<1>:f   acc0.0<1;1>:f    r1.0<1;1>:f       r6.1<0>:f
> +(W)     mad (16|M0)              acc0.0<1>:f   r6.7<0;0>:f      r1.0<1;1>:f       r6.4<0>:f
> +(W)     mad (16|M0)              r114.0<1>:f   acc0.0<1;1>:f    r2.0<1;1>:f       r6.5<0>:f
> +(W)     send.smpl (16|M0)        r12      r113  null:0  0x0            0x04420001           {F@1,$0} // wr:2+0, rd:4; simd16 sample:u+v+r+ai+mlod using sampler index 0
> +(W)     send.rc (16|M0)          null     r12   null:0  0x0            0x08031400           {EOT,$0} // wr:4+0, rd:0; full-precision render target write SIMD16; last render target to surface 0
> +L96:
> diff --git a/lib/rendercopy.h b/lib/rendercopy.h

> index 0d81d27f83..1a97a72573 100644
> --- a/lib/rendercopy.h
> +++ b/lib/rendercopy.h
> @@ -43,6 +43,10 @@ void gen12p71_render_copyfunc(struct intel_bb *ibb,
>  			      struct intel_buf *src, uint32_t src_x, uint32_t src_y,
>  			      uint32_t width, uint32_t height,
>  			      struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
> +void genxe2_render_copyfunc(struct intel_bb *ibb,
> +			    struct intel_buf *src, uint32_t src_x, uint32_t src_y,
> +			    uint32_t width, uint32_t height,
> +			    struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
>  void gen12_render_copyfunc(struct intel_bb *ibb,
>  			   struct intel_buf *src, uint32_t src_x, uint32_t src_y,
>  			   uint32_t width, uint32_t height,
> diff --git a/lib/rendercopy_gen9.c b/lib/rendercopy_gen9.c
> index 363bc6c1b2..f0efadeb50 100644
> --- a/lib/rendercopy_gen9.c
> +++ b/lib/rendercopy_gen9.c
> @@ -22,6 +22,7 @@
>  #include "intel_mocs.h"
>  #include "rendercopy.h"
>  #include "gen9_render.h"
> +#include "genxe2_render.h"
>  #include "intel_reg.h"
>  #include "igt_aux.h"
>  #include "intel_chipset.h"
> @@ -136,6 +137,15 @@ static const uint32_t gen12p71_render_copy[][4] = {
>  	{ 0x80041131, 0x00000004, 0x50007144, 0x00c40000 },
>  };
>  
> +static const uint32_t xe2_render_copy[][4] = {
> +	{ 0x8010005b, 0x200002a0, 0x020a0634, 0x06040105 },
> +	{ 0x8010005b, 0x710402a8, 0x020a2001, 0x06140105 },
> +	{ 0x8010005b, 0x200002a0, 0x020a0674, 0x06440105 },
> +	{ 0x8010005b, 0x720402a8, 0x020a2001, 0x06540205 },
> +	{ 0x80122031, 0x0c240000, 0x20027114, 0x00800000 },
> +	{ 0x8010c031, 0x00000004, 0x58000c24, 0x00c40000 },
> +};
> +
>  /* Mostly copy+paste from gen6, except height, width, pitch moved */
>  static uint32_t
>  gen9_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst,
> @@ -545,7 +555,10 @@ gen9_emit_state_base_address(struct intel_bb *ibb) {
>  	/* WaBindlessSurfaceStateModifyEnable:skl,bxt */
>  	/* The length has to be one less if we dont modify
>  	   bindless state */
> -	intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (19 - 1 - 2));
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | 20);
> +	else
> +		intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (19 - 1 - 2));
>  
>  	/* general */
>  	intel_bb_out(ibb, 0 | BASE_ADDRESS_MODIFY);
> @@ -586,6 +599,13 @@ gen9_emit_state_base_address(struct intel_bb *ibb) {
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
> +
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20)) {
> +		/* Bindless sampler */
> +		intel_bb_out(ibb, 0);
> +		intel_bb_out(ibb, 0);
> +		intel_bb_out(ibb, 0);
> +	}
>  }
>  
>  static void
> @@ -753,7 +773,10 @@ gen9_emit_ds(struct intel_bb *ibb) {
>  
>  static void
>  gen8_emit_wm_hz_op(struct intel_bb *ibb) {
> -	intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (5-2));
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (6-2));
> +	else
> +		intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (5-2));
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
> 
Shouldn't we add an intel_bb_out(ibb, 0); here as well? You increased the length of the instruction.
> @@ -852,7 +875,11 @@ gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel, bool fast_clear) {
>  	intel_bb_out(ibb, (max_threads - 1) << GEN8_3DSTATE_PS_MAX_THREADS_SHIFT |
>  	             GEN6_3DSTATE_WM_16_DISPATCH_ENABLE |
>  	             (fast_clear ? GEN8_3DSTATE_FAST_CLEAR_ENABLE : 0));
> -	intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT |
> +			     GENXE_KERNEL0_POLY_PACK16_FIXED << GENXE_KERNEL0_PACKING_POLICY);
Looks sane.
> +	else
> +		intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
>  	intel_bb_out(ibb, 0); // kernel 1
>  	intel_bb_out(ibb, 0); /* kernel 1 hi */
>  	intel_bb_out(ibb, 0); // kernel 2
> @@ -862,7 +889,11 @@ gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel, bool fast_clear) {
>  	intel_bb_out(ibb, GEN8_PS_BLEND_HAS_WRITEABLE_RT);
>  
>  	intel_bb_out(ibb, GEN8_3DSTATE_PS_EXTRA | (2 - 2));
> -	intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
> +
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID);
> +	else
> +		intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
Looking at the spec, the meaning of that field has not changed. So why do we need to clear this?
Was it wrong before?

Overall, it looks good to me, just minor nits spotted.

Regards, 
Dominik 
>  }
>  
>  static void
> @@ -903,6 +934,9 @@ gen9_emit_depth(struct intel_bb *ibb)
>  
>  static void
>  gen7_emit_clear(struct intel_bb *ibb) {
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		return;
> +
>  	intel_bb_out(ibb, GEN7_3DSTATE_CLEAR_PARAMS | (3-2));
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 1); // clear valid
> @@ -911,7 +945,10 @@ gen7_emit_clear(struct intel_bb *ibb) {
>  static void
>  gen6_emit_drawing_rectangle(struct intel_bb *ibb, const struct intel_buf *dst)
>  {
> -	intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		intel_bb_out(ibb, GENXE2_3DSTATE_DRAWING_RECTANGLE_FAST | (4 - 2));
> +	else
> +		intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, (intel_buf_height(dst) - 1) << 16 | (intel_buf_width(dst) - 1));
>  	intel_bb_out(ibb, 0);
> @@ -1220,6 +1257,19 @@ void gen12p71_render_copyfunc(struct intel_bb *ibb,
>  			sizeof(gen12p71_render_copy));
>  }
>  
> +void genxe2_render_copyfunc(struct intel_bb *ibb,
> +			    struct intel_buf *src, uint32_t src_x, uint32_t src_y,
> +			    uint32_t width, uint32_t height,
> +			    struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y)
> +{
> +	_gen9_render_op(ibb, src, src_x, src_y,
> +			  width, height, dst, dst_x, dst_y,
> +			  NULL,
> +			  NULL,
> +			  xe2_render_copy,
> +			  sizeof(xe2_render_copy));
> +}
> +
>  void mtl_render_copyfunc(struct intel_bb *ibb,
>  			 struct intel_buf *src,
>  			 unsigned int src_x, unsigned int src_y,


  reply	other threads:[~2024-01-08 15:59 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-01-08 11:30 [PATCH i-g-t 0/3] Add Xe2 render-copy implementation Zbigniew Kempczyński
2024-01-08 11:30 ` [PATCH i-g-t 1/3] lib/rendercopy: Add render-copy xe2 implementation Zbigniew Kempczyński
2024-01-08 15:59   ` Grzegorzek, Dominik [this message]
2024-01-09 13:16     ` Zbigniew Kempczyński
2024-01-08 11:30 ` [PATCH i-g-t 2/3] lib/intel_batchbuffer: Select xe2 rendercopy for LunarLake Zbigniew Kempczyński
2024-01-10  8:46   ` Grzegorzek, Dominik
2024-01-10 11:25   ` Matthew Auld
2024-01-10 12:02     ` Zbigniew Kempczyński
2024-01-08 11:30 ` [PATCH i-g-t 3/3] tests/xe_intel_bb: Use Tile4 instead Y on render subtest Zbigniew Kempczyński
2024-01-08 12:27   ` Grzegorzek, Dominik
2024-01-08 12:09 ` ✗ GitLab.Pipeline: warning for Add Xe2 render-copy implementation Patchwork
2024-01-08 12:45 ` ✓ Fi.CI.BAT: success " Patchwork
2024-01-08 12:46 ` ✓ CI.xeBAT: " Patchwork
2024-01-08 14:00 ` ✗ Fi.CI.IGT: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=eeb22ef9c7ffe590dd94df1b5cea7ee2deddb408.camel@intel.com \
    --to=dominik.grzegorzek@intel.com \
    --cc=igt-dev@lists.freedesktop.org \
    --cc=zbigniew.kempczynski@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox