[igt-dev] [PATCH i-g-t v2 0/2] Add dg1 compute pipeline

Igt-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [igt-dev] [PATCH i-g-t v2 0/2] Add dg1 compute pipeline
@ 2023-11-14 10:41 Zbigniew Kempczyński
  2023-11-14 10:41 ` [igt-dev] [PATCH i-g-t v2 1/2] lib/intel_compute: Prepare tgllp compute functions to be dg1 ready Zbigniew Kempczyński
  2023-11-14 10:41 ` [igt-dev] [PATCH i-g-t v2 2/2] lib/intel_compute: Add dg1 compute implementation for i915 Zbigniew Kempczyński
  0 siblings, 2 replies; 5+ messages in thread
From: Zbigniew Kempczyński @ 2023-11-14 10:41 UTC (permalink / raw)
  To: igt-dev

Enable dg1 compute pipeline for i915.

v2: remove xe_ prefix and add comment about indirect data (Francois)

Zbigniew Kempczyński (2):
  lib/intel_compute: Prepare tgllp compute functions to be dg1 ready
  lib/intel_compute: Add dg1 compute implementation for i915

 lib/intel_compute.c                | 236 ++++++++++++++++++-----------
 lib/intel_compute_square_kernels.c |  42 +++++
 2 files changed, 187 insertions(+), 91 deletions(-)

-- 
2.34.1

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [igt-dev] [PATCH i-g-t v2 1/2] lib/intel_compute: Prepare tgllp compute functions to be dg1 ready
  2023-11-14 10:41 [igt-dev] [PATCH i-g-t v2 0/2] Add dg1 compute pipeline Zbigniew Kempczyński
@ 2023-11-14 10:41 ` Zbigniew Kempczyński
  2023-11-14 11:12   ` Francois Dugast
  2023-11-14 10:41 ` [igt-dev] [PATCH i-g-t v2 2/2] lib/intel_compute: Add dg1 compute implementation for i915 Zbigniew Kempczyński
  1 sibling, 1 reply; 5+ messages in thread
From: Zbigniew Kempczyński @ 2023-11-14 10:41 UTC (permalink / raw)
  To: igt-dev

Preparing dynamic, surface and indirect data states is similar between
platforms so let's rename it to "xe" prefix. It might be confusing
at first glance due to "xe" prefix clash between platform and new
driver but it is closed (static) in this compilation unit.
Preparing indirect data was rewritten to generate input for kernels.

v2: avoid name confusion with "xe_" prefix (Francois)

Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
---
 lib/intel_compute.c | 115 ++++++++++++++------------------------------
 1 file changed, 35 insertions(+), 80 deletions(-)

diff --git a/lib/intel_compute.c b/lib/intel_compute.c
index 7f1ea90e72..772c22fa37 100644
--- a/lib/intel_compute.c
+++ b/lib/intel_compute.c
@@ -203,120 +203,75 @@ static void bo_execenv_exec(struct bo_execenv *execenv, uint64_t start_addr)
  */
 
 /**
- * tgllp_create_indirect_data:
+ * create_indirect_data:
  * @addr_bo_buffer_batch: pointer to batch buffer
  * @addr_input: input buffer gpu offset
  * @addr_output: output buffer gpu offset
  *
  * Prepares indirect data for compute pipeline.
  */
-static void tgllp_create_indirect_data(uint32_t *addr_bo_buffer_batch,
-				       uint64_t addr_input,
-				       uint64_t addr_output)
+static void create_indirect_data(uint32_t *addr_bo_buffer_batch,
+				 uint64_t addr_input,
+				 uint64_t addr_output,
+				 uint32_t end_value)
 {
-	int b = 0;
+	uint32_t val = 0;
+	int b = 0, curr = 0;
 
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000200;
+
 	addr_bo_buffer_batch[b++] = 0x00000001;
 	addr_bo_buffer_batch[b++] = 0x00000001;
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000000;
+
 	addr_bo_buffer_batch[b++] = addr_input & 0xffffffff;
 	addr_bo_buffer_batch[b++] = addr_input >> 32;
 	addr_bo_buffer_batch[b++] = addr_output & 0xffffffff;
 	addr_bo_buffer_batch[b++] = addr_output >> 32;
+
 	addr_bo_buffer_batch[b++] = 0x00000400;
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000000;
+
 	addr_bo_buffer_batch[b++] = 0x00000200;
 	addr_bo_buffer_batch[b++] = 0x00000001;
 	addr_bo_buffer_batch[b++] = 0x00000001;
 	addr_bo_buffer_batch[b++] = 0x00000000;
+
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00010000;
-	addr_bo_buffer_batch[b++] = 0x00030002;
-	addr_bo_buffer_batch[b++] = 0x00050004;
-	addr_bo_buffer_batch[b++] = 0x00070006;
-	addr_bo_buffer_batch[b++] = 0x00090008;
-	addr_bo_buffer_batch[b++] = 0x000B000A;
-	addr_bo_buffer_batch[b++] = 0x000D000C;
-	addr_bo_buffer_batch[b++] = 0x000F000E;
-	addr_bo_buffer_batch[b++] = 0x00110010;
-	addr_bo_buffer_batch[b++] = 0x00130012;
-	addr_bo_buffer_batch[b++] = 0x00150014;
-	addr_bo_buffer_batch[b++] = 0x00170016;
-	addr_bo_buffer_batch[b++] = 0x00190018;
-	addr_bo_buffer_batch[b++] = 0x001B001A;
-	addr_bo_buffer_batch[b++] = 0x001D001C;
-	addr_bo_buffer_batch[b++] = 0x001F001E;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00210020;
-	addr_bo_buffer_batch[b++] = 0x00230022;
-	addr_bo_buffer_batch[b++] = 0x00250024;
-	addr_bo_buffer_batch[b++] = 0x00270026;
-	addr_bo_buffer_batch[b++] = 0x00290028;
-	addr_bo_buffer_batch[b++] = 0x002B002A;
-	addr_bo_buffer_batch[b++] = 0x002D002C;
-	addr_bo_buffer_batch[b++] = 0x002F002E;
-	addr_bo_buffer_batch[b++] = 0x00310030;
-	addr_bo_buffer_batch[b++] = 0x00330032;
-	addr_bo_buffer_batch[b++] = 0x00350034;
-	addr_bo_buffer_batch[b++] = 0x00370036;
-	addr_bo_buffer_batch[b++] = 0x00390038;
-	addr_bo_buffer_batch[b++] = 0x003B003A;
-	addr_bo_buffer_batch[b++] = 0x003D003C;
-	addr_bo_buffer_batch[b++] = 0x003F003E;
+
+	/*
+	 * Runtime prepares 32 incremented 16-bit values, packed two per dword.
+	 * Then it leaves a 32-dword gap filled with zeroes. Pattern looks the
+	 * same for tgl and dg1 (apart from the number of values).
+	 */
+	while (val < end_value) {
+		addr_bo_buffer_batch[b++] = val | ((val + 1) << 16);
+		val += 2;
+		if (++curr % 16 == 0)
+			b += 32;
+	}
 }
 
 /**
- * tgllp_create_surface_state:
+ * create_surface_state:
  * @addr_bo_buffer_batch: pointer to batch buffer
  * @addr_input: input buffer gpu offset
  * @addr_output: output buffer gpu offset
  *
  * Prepares surface state for compute pipeline.
  */
-static void tgllp_create_surface_state(uint32_t *addr_bo_buffer_batch,
-				       uint64_t addr_input,
-				       uint64_t addr_output)
+static void create_surface_state(uint32_t *addr_bo_buffer_batch,
+				 uint64_t addr_input,
+				 uint64_t addr_output)
 {
 	int b = 0;
 
@@ -387,14 +342,14 @@ static void tgllp_create_surface_state(uint32_t *addr_bo_buffer_batch,
 }
 
 /**
- * tgllp_create_dynamic_state:
+ * create_dynamic_state:
  * @addr_bo_buffer_batch: pointer to batch buffer
  * @offset_kernel: gpu offset of the shader
  *
  * Prepares dynamic state for compute pipeline.
  */
-static void tgllp_create_dynamic_state(uint32_t *addr_bo_buffer_batch,
-				       uint64_t offset_kernel)
+static void create_dynamic_state(uint32_t *addr_bo_buffer_batch,
+				 uint64_t offset_kernel)
 {
 	int b = 0;
 
@@ -582,9 +537,9 @@ static void tgl_compute_exec(int fd, const unsigned char *kernel,
 	bo_execenv_bind(&execenv, bo_dict, TGL_BO_DICT_ENTRIES);
 
 	memcpy(bo_dict[0].data, kernel, size);
-	tgllp_create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
-	tgllp_create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
-	tgllp_create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT);
+	create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
+	create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
+	create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT, 0x40);
 
 	dinput = (float *)bo_dict[4].data;
 	srand(time(NULL));
@@ -852,7 +807,7 @@ static void xehp_compute_exec(int fd, const unsigned char *kernel,
 	bo_execenv_bind(&execenv, bo_dict, XEHP_BO_DICT_ENTRIES);
 
 	memcpy(bo_dict[0].data, kernel, size);
-	tgllp_create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
+	create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
 	xehp_create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
 	xehp_create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT);
 	xehp_create_surface_state(bo_dict[7].data, ADDR_INPUT, ADDR_OUTPUT);
-- 
2.34.1

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [igt-dev] [PATCH i-g-t v2 2/2] lib/intel_compute: Add dg1 compute implementation for i915
  2023-11-14 10:41 [igt-dev] [PATCH i-g-t v2 0/2] Add dg1 compute pipeline Zbigniew Kempczyński
  2023-11-14 10:41 ` [igt-dev] [PATCH i-g-t v2 1/2] lib/intel_compute: Prepare tgllp compute functions to be dg1 ready Zbigniew Kempczyński
@ 2023-11-14 10:41 ` Zbigniew Kempczyński
  2023-11-14 11:17   ` Francois Dugast
  1 sibling, 1 reply; 5+ messages in thread
From: Zbigniew Kempczyński @ 2023-11-14 10:41 UTC (permalink / raw)
  To: igt-dev

Extend current testing for i915 and add a compute pipeline dedicated
to dg1. Due to the ppgtt limitation to 47 bits on dg1, alter offsets
to use lower addresses.

Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
---
 lib/intel_compute.c                | 123 ++++++++++++++++++++++++++---
 lib/intel_compute_square_kernels.c |  42 ++++++++++
 2 files changed, 153 insertions(+), 12 deletions(-)

diff --git a/lib/intel_compute.c b/lib/intel_compute.c
index 772c22fa37..248046895b 100644
--- a/lib/intel_compute.c
+++ b/lib/intel_compute.c
@@ -33,9 +33,9 @@
 #define ADDR_OUTPUT			0x300000UL
 #define ADDR_SURFACE_STATE_BASE		0x400000UL
 #define ADDR_DYNAMIC_STATE_BASE		0x500000UL
-#define ADDR_INDIRECT_OBJECT_BASE	0x800100000000
-#define OFFSET_INDIRECT_DATA_START	0xFFFDF000
-#define OFFSET_KERNEL			0xFFFEF000
+#define ADDR_INDIRECT_OBJECT_BASE	0x100000000
+#define OFFSET_INDIRECT_DATA_START	0xFFFD0000
+#define OFFSET_KERNEL			0xFFFE0000
 
 #define XEHP_ADDR_GENERAL_STATE_BASE		0x80000000UL
 #define XEHP_ADDR_INSTRUCTION_STATE_BASE	0x90000000UL
@@ -494,13 +494,98 @@ static void tgllp_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
 }
 
 /**
- * tgl_compute_exec - run a pipeline compatible with Tiger Lake
+ * dg1_compute_exec_compute:
+ * @addr_bo_buffer_batch: pointer to batch buffer
+ * @addr_surface_state_base: gpu offset of surface state data
+ * @addr_dynamic_state_base: gpu offset of dynamic state data
+ * @addr_indirect_object_base: gpu offset of indirect object data
+ * @offset_indirect_data_start: gpu offset of indirect data start
+ *
+ * Prepares compute pipeline.
+ */
+static void dg1_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
+				     uint64_t addr_surface_state_base,
+				     uint64_t addr_dynamic_state_base,
+				     uint64_t addr_indirect_object_base,
+				     uint64_t offset_indirect_data_start)
+{
+	int b = 0;
+
+	addr_bo_buffer_batch[b++] = XEHP_STATE_COMPUTE_MODE;
+	addr_bo_buffer_batch[b++] = 0x00180010;
+
+	addr_bo_buffer_batch[b++] = MEDIA_VFE_STATE | (9 - 2);
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x02FF0100;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x04000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+
+	addr_bo_buffer_batch[b++] = MI_LOAD_REGISTER_IMM(1);
+	addr_bo_buffer_batch[b++] = 0x00002580;
+	addr_bo_buffer_batch[b++] = 0x00060002;
+
+	addr_bo_buffer_batch[b++] = STATE_BASE_ADDRESS | 0x14;
+	addr_bo_buffer_batch[b++] = 0x00000001;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x000A0000;
+	addr_bo_buffer_batch[b++] = (addr_surface_state_base & 0xffffffff) | 0x1;
+	addr_bo_buffer_batch[b++] = addr_surface_state_base >> 32;
+	addr_bo_buffer_batch[b++] = (addr_dynamic_state_base & 0xffffffff) | 0x1;
+	addr_bo_buffer_batch[b++] = addr_dynamic_state_base >> 32;
+	addr_bo_buffer_batch[b++] = (addr_indirect_object_base & 0xffffffff) | 0x1;
+	addr_bo_buffer_batch[b++] = (addr_indirect_object_base >> 32) | 0xffff0000;
+	addr_bo_buffer_batch[b++] = (addr_indirect_object_base & 0xffffffff) | 0xA1;
+	addr_bo_buffer_batch[b++] = addr_indirect_object_base >> 32;
+	addr_bo_buffer_batch[b++] = 0xFFFFF001;
+	addr_bo_buffer_batch[b++] = 0x00010001;
+	addr_bo_buffer_batch[b++] = 0xFFFFF001;
+	addr_bo_buffer_batch[b++] = 0xFFFFF001;
+	addr_bo_buffer_batch[b++] = (addr_surface_state_base & 0xffffffff) | 0xA1;
+	addr_bo_buffer_batch[b++] = addr_surface_state_base >> 32;
+	addr_bo_buffer_batch[b++] = 0x003BF000;
+	addr_bo_buffer_batch[b++] = 0x000000A1;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+
+	addr_bo_buffer_batch[b++] = MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2);
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000020;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+
+	addr_bo_buffer_batch[b++] = GPGPU_WALKER | 13;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000c80;
+	addr_bo_buffer_batch[b++] = offset_indirect_data_start;
+	addr_bo_buffer_batch[b++] = 0x8000000f;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000002;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000001;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000001;
+	addr_bo_buffer_batch[b++] = 0xffffffff;
+	addr_bo_buffer_batch[b++] = 0xffffffff;
+
+	addr_bo_buffer_batch[b++] = MEDIA_STATE_FLUSH;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+
+	addr_bo_buffer_batch[b++] = MI_BATCH_BUFFER_END;
+}
+
+/**
+ * xe_compute_exec - run a pipeline compatible with Tiger Lake and DG1
  *
  * @fd: file descriptor of the opened DRM device
  * @kernel: GPU Kernel binary to be executed
  * @size: size of @kernel.
  */
-static void tgl_compute_exec(int fd, const unsigned char *kernel,
+static void xe_compute_exec(int fd, const unsigned char *kernel,
 			     unsigned int size)
 {
 #define TGL_BO_DICT_ENTRIES 7
@@ -528,6 +613,7 @@ static void tgl_compute_exec(int fd, const unsigned char *kernel,
 	};
 	struct bo_execenv execenv;
 	float *dinput;
+	uint16_t devid = intel_get_drm_devid(fd);
 
 	bo_execenv_create(fd, &execenv);
 
@@ -539,18 +625,26 @@ static void tgl_compute_exec(int fd, const unsigned char *kernel,
 	memcpy(bo_dict[0].data, kernel, size);
 	create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
 	create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
-	create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT, 0x40);
+	create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT,
+			     IS_DG1(devid) ? 0x200 : 0x40);
 
 	dinput = (float *)bo_dict[4].data;
 	srand(time(NULL));
 	for (int i = 0; i < SIZE_DATA; i++)
 		((float *)dinput)[i] = rand() / (float)RAND_MAX;
 
-	tgllp_compute_exec_compute(bo_dict[6].data,
-				   ADDR_SURFACE_STATE_BASE,
-				   ADDR_DYNAMIC_STATE_BASE,
-				   ADDR_INDIRECT_OBJECT_BASE,
-				   OFFSET_INDIRECT_DATA_START);
+	if (IS_DG1(devid))
+		dg1_compute_exec_compute(bo_dict[6].data,
+					 ADDR_SURFACE_STATE_BASE,
+					 ADDR_DYNAMIC_STATE_BASE,
+					 ADDR_INDIRECT_OBJECT_BASE,
+					 OFFSET_INDIRECT_DATA_START);
+	else
+		tgllp_compute_exec_compute(bo_dict[6].data,
+					   ADDR_SURFACE_STATE_BASE,
+					   ADDR_DYNAMIC_STATE_BASE,
+					   ADDR_INDIRECT_OBJECT_BASE,
+					   OFFSET_INDIRECT_DATA_START);
 
 	bo_execenv_exec(&execenv, ADDR_BATCH);
 
@@ -1063,9 +1157,14 @@ static const struct {
 } intel_compute_batches[] = {
 	{
 		.ip_ver = IP_VER(12, 0),
-		.compute_exec = tgl_compute_exec,
+		.compute_exec = xe_compute_exec,
 		.compat = COMPAT_DRIVER_I915 | COMPAT_DRIVER_XE,
 	},
+	{
+		.ip_ver = IP_VER(12, 10),
+		.compute_exec = xe_compute_exec,
+		.compat = COMPAT_DRIVER_I915,
+	},
 	{
 		.ip_ver = IP_VER(12, 55),
 		.compute_exec = xehp_compute_exec,
diff --git a/lib/intel_compute_square_kernels.c b/lib/intel_compute_square_kernels.c
index d094c23ccb..3d5b1ad475 100644
--- a/lib/intel_compute_square_kernels.c
+++ b/lib/intel_compute_square_kernels.c
@@ -61,6 +61,43 @@ static const unsigned char tgllp_kernel_square_bin[] = {
 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
 };
 
+static const unsigned char dg1_kernel_square_bin[] = {
+	0x61, 0x00, 0x03, 0x80, 0x20, 0x02, 0x05, 0x03, 0x04, 0x00, 0x10, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x66, 0x01, 0x00, 0x80, 0x20, 0x82, 0x01, 0x80,
+	0x00, 0x80, 0x00, 0x01, 0xc0, 0x04, 0xc0, 0x04, 0x41, 0x01, 0x20, 0x22,
+	0x16, 0x09, 0x11, 0x03, 0x49, 0x00, 0x04, 0xa2, 0x12, 0x09, 0x11, 0x03,
+	0x40, 0x01, 0x04, 0x00, 0x60, 0x06, 0x05, 0x05, 0x04, 0x04, 0x00, 0x01,
+	0x05, 0x01, 0x58, 0x00, 0x40, 0x00, 0x24, 0x00, 0x60, 0x06, 0x05, 0x0a,
+	0x04, 0x04, 0x00, 0x01, 0x05, 0x02, 0x58, 0x00, 0x40, 0x02, 0x0c, 0xa0,
+	0x02, 0x05, 0x10, 0x07, 0x40, 0x02, 0x0e, 0xa6, 0x02, 0x0a, 0x10, 0x07,
+	0x70, 0x02, 0x04, 0x00, 0x60, 0x02, 0x01, 0x00, 0x05, 0x0c, 0x46, 0x52,
+	0x84, 0x08, 0x00, 0x00, 0x70, 0x02, 0x24, 0x00, 0x60, 0x02, 0x01, 0x00,
+	0x05, 0x0e, 0x46, 0x52, 0x84, 0x08, 0x00, 0x00, 0x72, 0x00, 0x02, 0x80,
+	0x50, 0x0d, 0x04, 0x01, 0x05, 0x01, 0x05, 0x1d, 0x05, 0x01, 0x05, 0x01,
+	0x22, 0x00, 0x05, 0x01, 0x00, 0xc0, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00,
+	0x90, 0x00, 0x00, 0x00, 0x69, 0x00, 0x10, 0x60, 0x02, 0x0c, 0x20, 0x00,
+	0x69, 0x00, 0x12, 0x66, 0x02, 0x0e, 0x20, 0x00, 0x40, 0x02, 0x14, 0xa0,
+	0x32, 0x10, 0x10, 0x08, 0x40, 0x02, 0x16, 0xa6, 0x32, 0x12, 0x10, 0x08,
+	0x31, 0xa0, 0x04, 0x00, 0x00, 0x00, 0x14, 0x18, 0x14, 0x14, 0x00, 0xcc,
+	0x00, 0x00, 0x16, 0x00, 0x31, 0x91, 0x24, 0x00, 0x00, 0x00, 0x14, 0x1a,
+	0x14, 0x16, 0x00, 0xcc, 0x00, 0x00, 0x16, 0x00, 0x40, 0x00, 0x10, 0xa0,
+	0x4a, 0x10, 0x10, 0x08, 0x40, 0x00, 0x12, 0xa6, 0x4a, 0x12, 0x10, 0x08,
+	0x41, 0x20, 0x18, 0x20, 0x00, 0x18, 0x00, 0x18, 0x41, 0x21, 0x1a, 0x26,
+	0x00, 0x1a, 0x00, 0x1a, 0x31, 0xa2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x14, 0x10, 0x02, 0xcc, 0x14, 0x18, 0x96, 0x00, 0x31, 0x93, 0x24, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x14, 0x12, 0x02, 0xcc, 0x14, 0x1a, 0x96, 0x00,
+	0x25, 0x00, 0x05, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x10, 0x00, 0x00, 0x00, 0x61, 0x00, 0x7f, 0x64, 0x00, 0x03, 0x10, 0x00,
+	0x31, 0x44, 0x03, 0x80, 0x00, 0x00, 0x0c, 0x1c, 0x0c, 0x03, 0x00, 0xa0,
+	0x00, 0x00, 0x78, 0x02, 0x61, 0x24, 0x03, 0x80, 0x20, 0x02, 0x01, 0x00,
+	0x05, 0x1c, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x04, 0x80,
+	0xa0, 0x4a, 0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x31, 0x01, 0x03, 0x80, 0x04, 0x00, 0x00, 0x00, 0x0c, 0x7f, 0x20, 0x70,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00
+};
+
 static const unsigned char xehp_kernel_square_bin[] = {
 	0x61, 0x31, 0x03, 0x80, 0x20, 0x42, 0x05, 0x7f, 0x00, 0x00, 0x00, 0x00,
 	0x00, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x80, 0x20, 0x82, 0x45, 0x7f,
@@ -152,6 +189,11 @@ const struct intel_compute_kernels intel_compute_square_kernels[] = {
 		.size = sizeof(tgllp_kernel_square_bin),
 		.kernel = tgllp_kernel_square_bin,
 	},
+	{
+		.ip_ver = IP_VER(12, 10),
+		.size = sizeof(dg1_kernel_square_bin),
+		.kernel = dg1_kernel_square_bin,
+	},
 	{
 		.ip_ver = IP_VER(12, 55),
 		.size = sizeof(xehp_kernel_square_bin),
-- 
2.34.1

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2 1/2] lib/intel_compute: Prepare tgllp compute functions to be dg1 ready
  2023-11-14 10:41 ` [igt-dev] [PATCH i-g-t v2 1/2] lib/intel_compute: Prepare tgllp compute functions to be dg1 ready Zbigniew Kempczyński
@ 2023-11-14 11:12   ` Francois Dugast
  0 siblings, 0 replies; 5+ messages in thread
From: Francois Dugast @ 2023-11-14 11:12 UTC (permalink / raw)
  To: Zbigniew Kempczyński; +Cc: igt-dev

On Tue, Nov 14, 2023 at 11:41:50AM +0100, Zbigniew Kempczyński wrote:
> Preparing dynamic, surface and indirect data states is similar between
> platforms so let's rename it to "xe" prefix. It might be confusing
> at first glance due to "xe" prefix clash between platform and new
> driver but it is closed (static) in this compilation unit.
> Preparing indirect data was rewritten to generate input for kernels.
> 
> v2: avoid name confusion with "xe_" prefix (Francois)

The commit message is no longer valid. Please update it to reflect v2
and with that:

	Reviewed-by: Francois Dugast <francois.dugast@intel.com>

> 
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski@intel.com>
> Cc: Francois Dugast <francois.dugast@intel.com>
> ---
>  lib/intel_compute.c | 115 ++++++++++++++------------------------------
>  1 file changed, 35 insertions(+), 80 deletions(-)
> 
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c
> index 7f1ea90e72..772c22fa37 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -203,120 +203,75 @@ static void bo_execenv_exec(struct bo_execenv *execenv, uint64_t start_addr)
>   */
>  
>  /**
> - * tgllp_create_indirect_data:
> + * create_indirect_data:
>   * @addr_bo_buffer_batch: pointer to batch buffer
>   * @addr_input: input buffer gpu offset
>   * @addr_output: output buffer gpu offset
>   *
>   * Prepares indirect data for compute pipeline.
>   */
> -static void tgllp_create_indirect_data(uint32_t *addr_bo_buffer_batch,
> -				       uint64_t addr_input,
> -				       uint64_t addr_output)
> +static void create_indirect_data(uint32_t *addr_bo_buffer_batch,
> +				 uint64_t addr_input,
> +				 uint64_t addr_output,
> +				 uint32_t end_value)
>  {
> -	int b = 0;
> +	uint32_t val = 0;
> +	int b = 0, curr = 0;
>  
>  	addr_bo_buffer_batch[b++] = 0x00000000;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
>  	addr_bo_buffer_batch[b++] = 0x00000200;
> +
>  	addr_bo_buffer_batch[b++] = 0x00000001;
>  	addr_bo_buffer_batch[b++] = 0x00000001;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
> +
>  	addr_bo_buffer_batch[b++] = addr_input & 0xffffffff;
>  	addr_bo_buffer_batch[b++] = addr_input >> 32;
>  	addr_bo_buffer_batch[b++] = addr_output & 0xffffffff;
>  	addr_bo_buffer_batch[b++] = addr_output >> 32;
> +
>  	addr_bo_buffer_batch[b++] = 0x00000400;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
> +
>  	addr_bo_buffer_batch[b++] = 0x00000200;
>  	addr_bo_buffer_batch[b++] = 0x00000001;
>  	addr_bo_buffer_batch[b++] = 0x00000001;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
> +
>  	addr_bo_buffer_batch[b++] = 0x00000000;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
>  	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00010000;
> -	addr_bo_buffer_batch[b++] = 0x00030002;
> -	addr_bo_buffer_batch[b++] = 0x00050004;
> -	addr_bo_buffer_batch[b++] = 0x00070006;
> -	addr_bo_buffer_batch[b++] = 0x00090008;
> -	addr_bo_buffer_batch[b++] = 0x000B000A;
> -	addr_bo_buffer_batch[b++] = 0x000D000C;
> -	addr_bo_buffer_batch[b++] = 0x000F000E;
> -	addr_bo_buffer_batch[b++] = 0x00110010;
> -	addr_bo_buffer_batch[b++] = 0x00130012;
> -	addr_bo_buffer_batch[b++] = 0x00150014;
> -	addr_bo_buffer_batch[b++] = 0x00170016;
> -	addr_bo_buffer_batch[b++] = 0x00190018;
> -	addr_bo_buffer_batch[b++] = 0x001B001A;
> -	addr_bo_buffer_batch[b++] = 0x001D001C;
> -	addr_bo_buffer_batch[b++] = 0x001F001E;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00210020;
> -	addr_bo_buffer_batch[b++] = 0x00230022;
> -	addr_bo_buffer_batch[b++] = 0x00250024;
> -	addr_bo_buffer_batch[b++] = 0x00270026;
> -	addr_bo_buffer_batch[b++] = 0x00290028;
> -	addr_bo_buffer_batch[b++] = 0x002B002A;
> -	addr_bo_buffer_batch[b++] = 0x002D002C;
> -	addr_bo_buffer_batch[b++] = 0x002F002E;
> -	addr_bo_buffer_batch[b++] = 0x00310030;
> -	addr_bo_buffer_batch[b++] = 0x00330032;
> -	addr_bo_buffer_batch[b++] = 0x00350034;
> -	addr_bo_buffer_batch[b++] = 0x00370036;
> -	addr_bo_buffer_batch[b++] = 0x00390038;
> -	addr_bo_buffer_batch[b++] = 0x003B003A;
> -	addr_bo_buffer_batch[b++] = 0x003D003C;
> -	addr_bo_buffer_batch[b++] = 0x003F003E;
> +
> +	/*
> +	 * Runtime prepares 32 incremented 16-bit values, packed two per dword.
> +	 * Then it leaves a 32-dword gap filled with zeroes. Pattern looks the
> +	 * same for tgl and dg1 (apart from the number of values).
> +	 */
> +	while (val < end_value) {
> +		addr_bo_buffer_batch[b++] = val | ((val + 1) << 16);
> +		val += 2;
> +		if (++curr % 16 == 0)
> +			b += 32;
> +	}
>  }
>  
>  /**
> - * tgllp_create_surface_state:
> + * create_surface_state:
>   * @addr_bo_buffer_batch: pointer to batch buffer
>   * @addr_input: input buffer gpu offset
>   * @addr_output: output buffer gpu offset
>   *
>   * Prepares surface state for compute pipeline.
>   */
> -static void tgllp_create_surface_state(uint32_t *addr_bo_buffer_batch,
> -				       uint64_t addr_input,
> -				       uint64_t addr_output)
> +static void create_surface_state(uint32_t *addr_bo_buffer_batch,
> +				 uint64_t addr_input,
> +				 uint64_t addr_output)
>  {
>  	int b = 0;
>  
> @@ -387,14 +342,14 @@ static void tgllp_create_surface_state(uint32_t *addr_bo_buffer_batch,
>  }
>  
>  /**
> - * tgllp_create_dynamic_state:
> + * create_dynamic_state:
>   * @addr_bo_buffer_batch: pointer to batch buffer
>   * @offset_kernel: gpu offset of the shader
>   *
>   * Prepares dynamic state for compute pipeline.
>   */
> -static void tgllp_create_dynamic_state(uint32_t *addr_bo_buffer_batch,
> -				       uint64_t offset_kernel)
> +static void create_dynamic_state(uint32_t *addr_bo_buffer_batch,
> +				 uint64_t offset_kernel)
>  {
>  	int b = 0;
>  
> @@ -582,9 +537,9 @@ static void tgl_compute_exec(int fd, const unsigned char *kernel,
>  	bo_execenv_bind(&execenv, bo_dict, TGL_BO_DICT_ENTRIES);
>  
>  	memcpy(bo_dict[0].data, kernel, size);
> -	tgllp_create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
> -	tgllp_create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
> -	tgllp_create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT);
> +	create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
> +	create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
> +	create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT, 0x40);
>  
>  	dinput = (float *)bo_dict[4].data;
>  	srand(time(NULL));
> @@ -852,7 +807,7 @@ static void xehp_compute_exec(int fd, const unsigned char *kernel,
>  	bo_execenv_bind(&execenv, bo_dict, XEHP_BO_DICT_ENTRIES);
>  
>  	memcpy(bo_dict[0].data, kernel, size);
> -	tgllp_create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
> +	create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
>  	xehp_create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
>  	xehp_create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT);
>  	xehp_create_surface_state(bo_dict[7].data, ADDR_INPUT, ADDR_OUTPUT);
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2 2/2] lib/intel_compute: Add dg1 compute implementation for i915
  2023-11-14 10:41 ` [igt-dev] [PATCH i-g-t v2 2/2] lib/intel_compute: Add dg1 compute implementation for i915 Zbigniew Kempczyński
@ 2023-11-14 11:17   ` Francois Dugast
  0 siblings, 0 replies; 5+ messages in thread
From: Francois Dugast @ 2023-11-14 11:17 UTC (permalink / raw)
  To: Zbigniew Kempczyński; +Cc: igt-dev

On Tue, Nov 14, 2023 at 11:41:51AM +0100, Zbigniew Kempczyński wrote:
> Extend current testing for i915 and add a compute pipeline dedicated
> to dg1. Due to the ppgtt limitation to 47 bits on dg1, alter offsets
> to use lower addresses.
> 
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski@intel.com>
> Cc: Francois Dugast <francois.dugast@intel.com>
> ---
>  lib/intel_compute.c                | 123 ++++++++++++++++++++++++++---
>  lib/intel_compute_square_kernels.c |  42 ++++++++++
>  2 files changed, 153 insertions(+), 12 deletions(-)
> 
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c
> index 772c22fa37..248046895b 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -33,9 +33,9 @@
>  #define ADDR_OUTPUT			0x300000UL
>  #define ADDR_SURFACE_STATE_BASE		0x400000UL
>  #define ADDR_DYNAMIC_STATE_BASE		0x500000UL
> -#define ADDR_INDIRECT_OBJECT_BASE	0x800100000000
> -#define OFFSET_INDIRECT_DATA_START	0xFFFDF000
> -#define OFFSET_KERNEL			0xFFFEF000
> +#define ADDR_INDIRECT_OBJECT_BASE	0x100000000
> +#define OFFSET_INDIRECT_DATA_START	0xFFFD0000
> +#define OFFSET_KERNEL			0xFFFE0000
>  
>  #define XEHP_ADDR_GENERAL_STATE_BASE		0x80000000UL
>  #define XEHP_ADDR_INSTRUCTION_STATE_BASE	0x90000000UL
> @@ -494,13 +494,98 @@ static void tgllp_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
>  }
>  
>  /**
> - * tgl_compute_exec - run a pipeline compatible with Tiger Lake
> + * dg1_compute_exec_compute:
> + * @addr_bo_buffer_batch: pointer to batch buffer
> + * @addr_surface_state_base: gpu offset of surface state data
> + * @addr_dynamic_state_base: gpu offset of dynamic state data
> + * @addr_indirect_object_base: gpu offset of indirect object data
> + * @offset_indirect_data_start: gpu offset of indirect data start
> + *
> + * Prepares compute pipeline.
> + */
> +static void dg1_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
> +				     uint64_t addr_surface_state_base,
> +				     uint64_t addr_dynamic_state_base,
> +				     uint64_t addr_indirect_object_base,
> +				     uint64_t offset_indirect_data_start)
> +{
> +	int b = 0;
> +
> +	addr_bo_buffer_batch[b++] = XEHP_STATE_COMPUTE_MODE;
> +	addr_bo_buffer_batch[b++] = 0x00180010;
> +
> +	addr_bo_buffer_batch[b++] = MEDIA_VFE_STATE | (9 - 2);
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x02FF0100;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x04000000;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +
> +	addr_bo_buffer_batch[b++] = MI_LOAD_REGISTER_IMM(1);
> +	addr_bo_buffer_batch[b++] = 0x00002580;
> +	addr_bo_buffer_batch[b++] = 0x00060002;
> +
> +	addr_bo_buffer_batch[b++] = STATE_BASE_ADDRESS | 0x14;
> +	addr_bo_buffer_batch[b++] = 0x00000001;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x000A0000;
> +	addr_bo_buffer_batch[b++] = (addr_surface_state_base & 0xffffffff) | 0x1;
> +	addr_bo_buffer_batch[b++] = addr_surface_state_base >> 32;
> +	addr_bo_buffer_batch[b++] = (addr_dynamic_state_base & 0xffffffff) | 0x1;
> +	addr_bo_buffer_batch[b++] = addr_dynamic_state_base >> 32;
> +	addr_bo_buffer_batch[b++] = (addr_indirect_object_base & 0xffffffff) | 0x1;
> +	addr_bo_buffer_batch[b++] = (addr_indirect_object_base >> 32) | 0xffff0000;
> +	addr_bo_buffer_batch[b++] = (addr_indirect_object_base & 0xffffffff) | 0xA1;
> +	addr_bo_buffer_batch[b++] = addr_indirect_object_base >> 32;
> +	addr_bo_buffer_batch[b++] = 0xFFFFF001;
> +	addr_bo_buffer_batch[b++] = 0x00010001;
> +	addr_bo_buffer_batch[b++] = 0xFFFFF001;
> +	addr_bo_buffer_batch[b++] = 0xFFFFF001;
> +	addr_bo_buffer_batch[b++] = (addr_surface_state_base & 0xffffffff) | 0xA1;
> +	addr_bo_buffer_batch[b++] = addr_surface_state_base >> 32;
> +	addr_bo_buffer_batch[b++] = 0x003BF000;
> +	addr_bo_buffer_batch[b++] = 0x000000A1;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +
> +	addr_bo_buffer_batch[b++] = MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2);
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000020;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +
> +	addr_bo_buffer_batch[b++] = GPGPU_WALKER | 13;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000c80;
> +	addr_bo_buffer_batch[b++] = offset_indirect_data_start;
> +	addr_bo_buffer_batch[b++] = 0x8000000f;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000002;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000001;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +	addr_bo_buffer_batch[b++] = 0x00000001;
> +	addr_bo_buffer_batch[b++] = 0xffffffff;
> +	addr_bo_buffer_batch[b++] = 0xffffffff;
> +
> +	addr_bo_buffer_batch[b++] = MEDIA_STATE_FLUSH;
> +	addr_bo_buffer_batch[b++] = 0x00000000;
> +
> +	addr_bo_buffer_batch[b++] = MI_BATCH_BUFFER_END;
> +}
> +
> +/**
> + * xe_compute_exec - run a pipeline compatible with Tiger Lake and DG1
>   *
>   * @fd: file descriptor of the opened DRM device
>   * @kernel: GPU Kernel binary to be executed
>   * @size: size of @kernel.
>   */
> -static void tgl_compute_exec(int fd, const unsigned char *kernel,
> +static void xe_compute_exec(int fd, const unsigned char *kernel,

Please drop the prefix here as well, to avoid confusion with the Xe driver.

Francois

>  			     unsigned int size)
>  {
>  #define TGL_BO_DICT_ENTRIES 7
> @@ -528,6 +613,7 @@ static void tgl_compute_exec(int fd, const unsigned char *kernel,
>  	};
>  	struct bo_execenv execenv;
>  	float *dinput;
> +	uint16_t devid = intel_get_drm_devid(fd);
>  
>  	bo_execenv_create(fd, &execenv);
>  
> @@ -539,18 +625,26 @@ static void tgl_compute_exec(int fd, const unsigned char *kernel,
>  	memcpy(bo_dict[0].data, kernel, size);
>  	create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
>  	create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
> -	create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT, 0x40);
> +	create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT,
> +			     IS_DG1(devid) ? 0x200 : 0x40);
>  
>  	dinput = (float *)bo_dict[4].data;
>  	srand(time(NULL));
>  	for (int i = 0; i < SIZE_DATA; i++)
>  		((float *)dinput)[i] = rand() / (float)RAND_MAX;
>  
> -	tgllp_compute_exec_compute(bo_dict[6].data,
> -				   ADDR_SURFACE_STATE_BASE,
> -				   ADDR_DYNAMIC_STATE_BASE,
> -				   ADDR_INDIRECT_OBJECT_BASE,
> -				   OFFSET_INDIRECT_DATA_START);
> +	if (IS_DG1(devid))
> +		dg1_compute_exec_compute(bo_dict[6].data,
> +					 ADDR_SURFACE_STATE_BASE,
> +					 ADDR_DYNAMIC_STATE_BASE,
> +					 ADDR_INDIRECT_OBJECT_BASE,
> +					 OFFSET_INDIRECT_DATA_START);
> +	else
> +		tgllp_compute_exec_compute(bo_dict[6].data,
> +					   ADDR_SURFACE_STATE_BASE,
> +					   ADDR_DYNAMIC_STATE_BASE,
> +					   ADDR_INDIRECT_OBJECT_BASE,
> +					   OFFSET_INDIRECT_DATA_START);
>  
>  	bo_execenv_exec(&execenv, ADDR_BATCH);
>  
> @@ -1063,9 +1157,14 @@ static const struct {
>  } intel_compute_batches[] = {
>  	{
>  		.ip_ver = IP_VER(12, 0),
> -		.compute_exec = tgl_compute_exec,
> +		.compute_exec = xe_compute_exec,
>  		.compat = COMPAT_DRIVER_I915 | COMPAT_DRIVER_XE,
>  	},
> +	{
> +		.ip_ver = IP_VER(12, 10),
> +		.compute_exec = xe_compute_exec,
> +		.compat = COMPAT_DRIVER_I915,
> +	},
>  	{
>  		.ip_ver = IP_VER(12, 55),
>  		.compute_exec = xehp_compute_exec,
> diff --git a/lib/intel_compute_square_kernels.c b/lib/intel_compute_square_kernels.c
> index d094c23ccb..3d5b1ad475 100644
> --- a/lib/intel_compute_square_kernels.c
> +++ b/lib/intel_compute_square_kernels.c
> @@ -61,6 +61,43 @@ static const unsigned char tgllp_kernel_square_bin[] = {
>  	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
>  };
>  
> +static const unsigned char dg1_kernel_square_bin[] = {
> +	0x61, 0x00, 0x03, 0x80, 0x20, 0x02, 0x05, 0x03, 0x04, 0x00, 0x10, 0x00,
> +	0x00, 0x00, 0x00, 0x00, 0x66, 0x01, 0x00, 0x80, 0x20, 0x82, 0x01, 0x80,
> +	0x00, 0x80, 0x00, 0x01, 0xc0, 0x04, 0xc0, 0x04, 0x41, 0x01, 0x20, 0x22,
> +	0x16, 0x09, 0x11, 0x03, 0x49, 0x00, 0x04, 0xa2, 0x12, 0x09, 0x11, 0x03,
> +	0x40, 0x01, 0x04, 0x00, 0x60, 0x06, 0x05, 0x05, 0x04, 0x04, 0x00, 0x01,
> +	0x05, 0x01, 0x58, 0x00, 0x40, 0x00, 0x24, 0x00, 0x60, 0x06, 0x05, 0x0a,
> +	0x04, 0x04, 0x00, 0x01, 0x05, 0x02, 0x58, 0x00, 0x40, 0x02, 0x0c, 0xa0,
> +	0x02, 0x05, 0x10, 0x07, 0x40, 0x02, 0x0e, 0xa6, 0x02, 0x0a, 0x10, 0x07,
> +	0x70, 0x02, 0x04, 0x00, 0x60, 0x02, 0x01, 0x00, 0x05, 0x0c, 0x46, 0x52,
> +	0x84, 0x08, 0x00, 0x00, 0x70, 0x02, 0x24, 0x00, 0x60, 0x02, 0x01, 0x00,
> +	0x05, 0x0e, 0x46, 0x52, 0x84, 0x08, 0x00, 0x00, 0x72, 0x00, 0x02, 0x80,
> +	0x50, 0x0d, 0x04, 0x01, 0x05, 0x01, 0x05, 0x1d, 0x05, 0x01, 0x05, 0x01,
> +	0x22, 0x00, 0x05, 0x01, 0x00, 0xc0, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00,
> +	0x90, 0x00, 0x00, 0x00, 0x69, 0x00, 0x10, 0x60, 0x02, 0x0c, 0x20, 0x00,
> +	0x69, 0x00, 0x12, 0x66, 0x02, 0x0e, 0x20, 0x00, 0x40, 0x02, 0x14, 0xa0,
> +	0x32, 0x10, 0x10, 0x08, 0x40, 0x02, 0x16, 0xa6, 0x32, 0x12, 0x10, 0x08,
> +	0x31, 0xa0, 0x04, 0x00, 0x00, 0x00, 0x14, 0x18, 0x14, 0x14, 0x00, 0xcc,
> +	0x00, 0x00, 0x16, 0x00, 0x31, 0x91, 0x24, 0x00, 0x00, 0x00, 0x14, 0x1a,
> +	0x14, 0x16, 0x00, 0xcc, 0x00, 0x00, 0x16, 0x00, 0x40, 0x00, 0x10, 0xa0,
> +	0x4a, 0x10, 0x10, 0x08, 0x40, 0x00, 0x12, 0xa6, 0x4a, 0x12, 0x10, 0x08,
> +	0x41, 0x20, 0x18, 0x20, 0x00, 0x18, 0x00, 0x18, 0x41, 0x21, 0x1a, 0x26,
> +	0x00, 0x1a, 0x00, 0x1a, 0x31, 0xa2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
> +	0x14, 0x10, 0x02, 0xcc, 0x14, 0x18, 0x96, 0x00, 0x31, 0x93, 0x24, 0x00,
> +	0x00, 0x00, 0x00, 0x00, 0x14, 0x12, 0x02, 0xcc, 0x14, 0x1a, 0x96, 0x00,
> +	0x25, 0x00, 0x05, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> +	0x10, 0x00, 0x00, 0x00, 0x61, 0x00, 0x7f, 0x64, 0x00, 0x03, 0x10, 0x00,
> +	0x31, 0x44, 0x03, 0x80, 0x00, 0x00, 0x0c, 0x1c, 0x0c, 0x03, 0x00, 0xa0,
> +	0x00, 0x00, 0x78, 0x02, 0x61, 0x24, 0x03, 0x80, 0x20, 0x02, 0x01, 0x00,
> +	0x05, 0x1c, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x04, 0x80,
> +	0xa0, 0x4a, 0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> +	0x31, 0x01, 0x03, 0x80, 0x04, 0x00, 0x00, 0x00, 0x0c, 0x7f, 0x20, 0x70,
> +	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> +	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> +	0x00, 0x00, 0x00, 0x00
> +};
> +
>  static const unsigned char xehp_kernel_square_bin[] = {
>  	0x61, 0x31, 0x03, 0x80, 0x20, 0x42, 0x05, 0x7f, 0x00, 0x00, 0x00, 0x00,
>  	0x00, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x80, 0x20, 0x82, 0x45, 0x7f,
> @@ -152,6 +189,11 @@ const struct intel_compute_kernels intel_compute_square_kernels[] = {
>  		.size = sizeof(tgllp_kernel_square_bin),
>  		.kernel = tgllp_kernel_square_bin,
>  	},
> +	{
> +		.ip_ver = IP_VER(12, 10),
> +		.size = sizeof(dg1_kernel_square_bin),
> +		.kernel = dg1_kernel_square_bin,
> +	},
>  	{
>  		.ip_ver = IP_VER(12, 55),
>  		.size = sizeof(xehp_kernel_square_bin),
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-11-14 11:17 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-14 10:41 [igt-dev] [PATCH i-g-t v2 0/2] Add dg1 compute pipeline Zbigniew Kempczyński
2023-11-14 10:41 ` [igt-dev] [PATCH i-g-t v2 1/2] lib/intel_compute: Prepare tgllp compute functions to be dg1 ready Zbigniew Kempczyński
2023-11-14 11:12   ` Francois Dugast
2023-11-14 10:41 ` [igt-dev] [PATCH i-g-t v2 2/2] lib/intel_compute: Add dg1 compute implementation for i915 Zbigniew Kempczyński
2023-11-14 11:17   ` Francois Dugast

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox