[PATCH 0/2] tcg: Streamline vector load/store

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 0/2] tcg: Streamline vector load/store
@ 2023-08-31  3:07 Richard Henderson
  2023-08-31  3:07 ` [PATCH 1/2] tcg: Add tcg_gen_{ld,st}_i128 Richard Henderson
  2023-08-31  3:07 ` [PATCH 2/2] target/i386: Use i128 for 128 and 256-bit loads and stores Richard Henderson
  0 siblings, 2 replies; 3+ messages in thread
From: Richard Henderson @ 2023-08-31  3:07 UTC (permalink / raw)
  To: qemu-devel

We have tcg_gen_qemu_{ld,st}_i128, which can be used to implement
load/store of vectors to guest memory.  But at present we have to
split into, or concatenated from, two i64 to reference the guest
vector register backing store within env.

Provide tcg_gen_{ld,st}_i128, which can avoid the trip through i64.

This does require that the target store i128 in host byte ordering,
which is true of i386 (and some other backends) but not arm or s390x.
There is definitely further cleanup possible.

r~

Richard Henderson (2):
  tcg: Add tcg_gen_{ld,st}_i128
  target/i386: Use i128 for 128 and 256-bit loads and stores

 include/tcg/tcg-op-common.h |  3 ++
 target/i386/tcg/translate.c | 61 ++++++++++++++++---------------------
 tcg/tcg-op.c                | 22 +++++++++++++
 3 files changed, 52 insertions(+), 34 deletions(-)

-- 
2.34.1

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 1/2] tcg: Add tcg_gen_{ld,st}_i128
  2023-08-31  3:07 [PATCH 0/2] tcg: Streamline vector load/store Richard Henderson
@ 2023-08-31  3:07 ` Richard Henderson
  2023-08-31  3:07 ` [PATCH 2/2] target/i386: Use i128 for 128 and 256-bit loads and stores Richard Henderson
  1 sibling, 0 replies; 3+ messages in thread
From: Richard Henderson @ 2023-08-31  3:07 UTC (permalink / raw)
  To: qemu-devel

Do not require the translators to jump through concat and
extract of i64 in order to move values to and from  env.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-common.h |  3 +++
 tcg/tcg-op.c                | 22 ++++++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h
index a53b15933b..c81cdbe11c 100644
--- a/include/tcg/tcg-op-common.h
+++ b/include/tcg/tcg-op-common.h
@@ -747,6 +747,9 @@ void tcg_gen_mov_i128(TCGv_i128 dst, TCGv_i128 src);
 void tcg_gen_extr_i128_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i128 arg);
 void tcg_gen_concat_i64_i128(TCGv_i128 ret, TCGv_i64 lo, TCGv_i64 hi);
 
+void tcg_gen_ld_i128(TCGv_i128 ret, TCGv_ptr base, tcg_target_long offset);
+void tcg_gen_st_i128(TCGv_i128 val, TCGv_ptr base, tcg_target_long offset);
+
 static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
 {
     tcg_gen_deposit_i64(ret, lo, hi, 32, 32);
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 02a8cadcc0..a005a0eb29 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -2880,6 +2880,28 @@ void tcg_gen_mov_i128(TCGv_i128 dst, TCGv_i128 src)
     }
 }
 
+void tcg_gen_ld_i128(TCGv_i128 ret, TCGv_ptr base, tcg_target_long offset)
+{
+    if (HOST_BIG_ENDIAN) {
+        tcg_gen_ld_i64(TCGV128_HIGH(ret), base, offset);
+        tcg_gen_ld_i64(TCGV128_LOW(ret), base, offset + 8);
+    } else {
+        tcg_gen_ld_i64(TCGV128_LOW(ret), base, offset);
+        tcg_gen_ld_i64(TCGV128_HIGH(ret), base, offset + 8);
+    }
+}
+
+void tcg_gen_st_i128(TCGv_i128 val, TCGv_ptr base, tcg_target_long offset)
+{
+    if (HOST_BIG_ENDIAN) {
+        tcg_gen_st_i64(TCGV128_HIGH(val), base, offset);
+        tcg_gen_st_i64(TCGV128_LOW(val), base, offset + 8);
+    } else {
+        tcg_gen_st_i64(TCGV128_LOW(val), base, offset);
+        tcg_gen_st_i64(TCGV128_HIGH(val), base, offset + 8);
+    }
+}
+
 /* QEMU specific operations.  */
 
 void tcg_gen_exit_tb(const TranslationBlock *tb, unsigned idx)
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/2] target/i386: Use i128 for 128 and 256-bit loads and stores
  2023-08-31  3:07 [PATCH 0/2] tcg: Streamline vector load/store Richard Henderson
  2023-08-31  3:07 ` [PATCH 1/2] tcg: Add tcg_gen_{ld,st}_i128 Richard Henderson
@ 2023-08-31  3:07 ` Richard Henderson
  1 sibling, 0 replies; 3+ messages in thread
From: Richard Henderson @ 2023-08-31  3:07 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/tcg/translate.c | 61 ++++++++++++++++---------------------
 1 file changed, 27 insertions(+), 34 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 90c7b32f36..bbcb81e908 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2919,58 +2919,51 @@ static inline void gen_stq_env_A0(DisasContext *s, int offset)
 static inline void gen_ldo_env_A0(DisasContext *s, int offset, bool align)
 {
     int mem_index = s->mem_index;
-    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index,
-                        MO_LEUQ | (align ? MO_ALIGN_16 : 0));
-    tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(0)));
-    tcg_gen_addi_tl(s->tmp0, s->A0, 8);
-    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
-    tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(1)));
+    TCGv_i128 t = tcg_temp_new_i128();
+
+    tcg_gen_qemu_ld_i128(t, s->A0, mem_index,
+                         MO_128 | MO_LE | (align ? MO_ALIGN_16 : 0));
+    tcg_gen_st_i128(t, cpu_env, offset);
 }
 
 static inline void gen_sto_env_A0(DisasContext *s, int offset, bool align)
 {
     int mem_index = s->mem_index;
-    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(0)));
-    tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index,
-                        MO_LEUQ | (align ? MO_ALIGN_16 : 0));
-    tcg_gen_addi_tl(s->tmp0, s->A0, 8);
-    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(1)));
-    tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+    TCGv_i128 t = tcg_temp_new_i128();
+
+    tcg_gen_ld_i128(t, cpu_env, offset);
+    tcg_gen_qemu_st_i128(t, s->A0, mem_index,
+                         MO_128 | MO_LE | (align ? MO_ALIGN_16 : 0));
 }
 
 static void gen_ldy_env_A0(DisasContext *s, int offset, bool align)
 {
     int mem_index = s->mem_index;
-    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index,
-                        MO_LEUQ | (align ? MO_ALIGN_32 : 0));
-    tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(0)));
-    tcg_gen_addi_tl(s->tmp0, s->A0, 8);
-    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
-    tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(1)));
+    TCGv_i128 t0 = tcg_temp_new_i128();
+    TCGv_i128 t1 = tcg_temp_new_i128();
 
+    tcg_gen_qemu_ld_i128(t0, s->A0, mem_index,
+                         MO_128 | MO_LE | (align ? MO_ALIGN_32 : 0));
     tcg_gen_addi_tl(s->tmp0, s->A0, 16);
-    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
-    tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(2)));
-    tcg_gen_addi_tl(s->tmp0, s->A0, 24);
-    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
-    tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(3)));
+    tcg_gen_qemu_ld_i128(t1, s->tmp0, mem_index,
+                         MO_128 | MO_LE | (align ? MO_ALIGN_16 : 0));
+
+    tcg_gen_st_i128(t0, cpu_env, offset + offsetof(YMMReg, YMM_X(0)));
+    tcg_gen_st_i128(t1, cpu_env, offset + offsetof(YMMReg, YMM_X(1)));
 }
 
 static void gen_sty_env_A0(DisasContext *s, int offset, bool align)
 {
     int mem_index = s->mem_index;
-    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(0)));
-    tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index,
-                        MO_LEUQ | (align ? MO_ALIGN_32 : 0));
-    tcg_gen_addi_tl(s->tmp0, s->A0, 8);
-    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(1)));
-    tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+    TCGv_i128 t = tcg_temp_new_i128();
+
+    tcg_gen_ld_i128(t, cpu_env, offset + offsetof(YMMReg, YMM_X(0)));
+    tcg_gen_qemu_st_i128(t, s->A0, mem_index,
+                         MO_128 | MO_LE | (align ? MO_ALIGN_32 : 0));
     tcg_gen_addi_tl(s->tmp0, s->A0, 16);
-    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(2)));
-    tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
-    tcg_gen_addi_tl(s->tmp0, s->A0, 24);
-    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(3)));
-    tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+    tcg_gen_ld_i128(t, cpu_env, offset + offsetof(YMMReg, YMM_X(1)));
+    tcg_gen_qemu_st_i128(t, s->tmp0, mem_index,
+                         MO_128 | MO_LE | (align ? MO_ALIGN_16 : 0));
 }
 
 #include "decode-new.h"
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-08-31  3:08 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-08-31  3:07 [PATCH 0/2] tcg: Streamline vector load/store Richard Henderson
2023-08-31  3:07 ` [PATCH 1/2] tcg: Add tcg_gen_{ld,st}_i128 Richard Henderson
2023-08-31  3:07 ` [PATCH 2/2] target/i386: Use i128 for 128 and 256-bit loads and stores Richard Henderson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).