* [PATCH 1/2] tcg: Add tcg_gen_{ld,st}_i128
2023-08-31 3:07 [PATCH 0/2] tcg: Streamline vector load/store Richard Henderson
@ 2023-08-31 3:07 ` Richard Henderson
2023-08-31 3:07 ` [PATCH 2/2] target/i386: Use i128 for 128 and 256-bit loads and stores Richard Henderson
1 sibling, 0 replies; 3+ messages in thread
From: Richard Henderson @ 2023-08-31 3:07 UTC (permalink / raw)
To: qemu-devel
Do not require the translators to jump through hoops, concatenating
and extracting i64 halves, in order to move i128 values to and from env.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
include/tcg/tcg-op-common.h | 3 +++
tcg/tcg-op.c | 22 ++++++++++++++++++++++
2 files changed, 25 insertions(+)
diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h
index a53b15933b..c81cdbe11c 100644
--- a/include/tcg/tcg-op-common.h
+++ b/include/tcg/tcg-op-common.h
@@ -747,6 +747,9 @@ void tcg_gen_mov_i128(TCGv_i128 dst, TCGv_i128 src);
void tcg_gen_extr_i128_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i128 arg);
void tcg_gen_concat_i64_i128(TCGv_i128 ret, TCGv_i64 lo, TCGv_i64 hi);
+void tcg_gen_ld_i128(TCGv_i128 ret, TCGv_ptr base, tcg_target_long offset);
+void tcg_gen_st_i128(TCGv_i128 val, TCGv_ptr base, tcg_target_long offset);
+
static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
{
tcg_gen_deposit_i64(ret, lo, hi, 32, 32);
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 02a8cadcc0..a005a0eb29 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -2880,6 +2880,28 @@ void tcg_gen_mov_i128(TCGv_i128 dst, TCGv_i128 src)
}
}
+void tcg_gen_ld_i128(TCGv_i128 ret, TCGv_ptr base, tcg_target_long offset)
+{
+ if (HOST_BIG_ENDIAN) {
+ tcg_gen_ld_i64(TCGV128_HIGH(ret), base, offset);
+ tcg_gen_ld_i64(TCGV128_LOW(ret), base, offset + 8);
+ } else {
+ tcg_gen_ld_i64(TCGV128_LOW(ret), base, offset);
+ tcg_gen_ld_i64(TCGV128_HIGH(ret), base, offset + 8);
+ }
+}
+
+void tcg_gen_st_i128(TCGv_i128 val, TCGv_ptr base, tcg_target_long offset)
+{
+ if (HOST_BIG_ENDIAN) {
+ tcg_gen_st_i64(TCGV128_HIGH(val), base, offset);
+ tcg_gen_st_i64(TCGV128_LOW(val), base, offset + 8);
+ } else {
+ tcg_gen_st_i64(TCGV128_LOW(val), base, offset);
+ tcg_gen_st_i64(TCGV128_HIGH(val), base, offset + 8);
+ }
+}
+
/* QEMU specific operations. */
void tcg_gen_exit_tb(const TranslationBlock *tb, unsigned idx)
--
2.34.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH 2/2] target/i386: Use i128 for 128 and 256-bit loads and stores
2023-08-31 3:07 [PATCH 0/2] tcg: Streamline vector load/store Richard Henderson
2023-08-31 3:07 ` [PATCH 1/2] tcg: Add tcg_gen_{ld,st}_i128 Richard Henderson
@ 2023-08-31 3:07 ` Richard Henderson
1 sibling, 0 replies; 3+ messages in thread
From: Richard Henderson @ 2023-08-31 3:07 UTC (permalink / raw)
To: qemu-devel
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/i386/tcg/translate.c | 61 ++++++++++++++++---------------------
1 file changed, 27 insertions(+), 34 deletions(-)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 90c7b32f36..bbcb81e908 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2919,58 +2919,51 @@ static inline void gen_stq_env_A0(DisasContext *s, int offset)
static inline void gen_ldo_env_A0(DisasContext *s, int offset, bool align)
{
int mem_index = s->mem_index;
- tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index,
- MO_LEUQ | (align ? MO_ALIGN_16 : 0));
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(0)));
- tcg_gen_addi_tl(s->tmp0, s->A0, 8);
- tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(1)));
+ TCGv_i128 t = tcg_temp_new_i128();
+
+ tcg_gen_qemu_ld_i128(t, s->A0, mem_index,
+ MO_128 | MO_LE | (align ? MO_ALIGN_16 : 0));
+ tcg_gen_st_i128(t, cpu_env, offset);
}
static inline void gen_sto_env_A0(DisasContext *s, int offset, bool align)
{
int mem_index = s->mem_index;
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(0)));
- tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index,
- MO_LEUQ | (align ? MO_ALIGN_16 : 0));
- tcg_gen_addi_tl(s->tmp0, s->A0, 8);
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(1)));
- tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+ TCGv_i128 t = tcg_temp_new_i128();
+
+ tcg_gen_ld_i128(t, cpu_env, offset);
+ tcg_gen_qemu_st_i128(t, s->A0, mem_index,
+ MO_128 | MO_LE | (align ? MO_ALIGN_16 : 0));
}
static void gen_ldy_env_A0(DisasContext *s, int offset, bool align)
{
int mem_index = s->mem_index;
- tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index,
- MO_LEUQ | (align ? MO_ALIGN_32 : 0));
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(0)));
- tcg_gen_addi_tl(s->tmp0, s->A0, 8);
- tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(1)));
+ TCGv_i128 t0 = tcg_temp_new_i128();
+ TCGv_i128 t1 = tcg_temp_new_i128();
+ tcg_gen_qemu_ld_i128(t0, s->A0, mem_index,
+ MO_128 | MO_LE | (align ? MO_ALIGN_32 : 0));
tcg_gen_addi_tl(s->tmp0, s->A0, 16);
- tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(2)));
- tcg_gen_addi_tl(s->tmp0, s->A0, 24);
- tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(3)));
+ tcg_gen_qemu_ld_i128(t1, s->tmp0, mem_index,
+ MO_128 | MO_LE | (align ? MO_ALIGN_16 : 0));
+
+ tcg_gen_st_i128(t0, cpu_env, offset + offsetof(YMMReg, YMM_X(0)));
+ tcg_gen_st_i128(t1, cpu_env, offset + offsetof(YMMReg, YMM_X(1)));
}
static void gen_sty_env_A0(DisasContext *s, int offset, bool align)
{
int mem_index = s->mem_index;
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(0)));
- tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index,
- MO_LEUQ | (align ? MO_ALIGN_32 : 0));
- tcg_gen_addi_tl(s->tmp0, s->A0, 8);
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(1)));
- tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+ TCGv_i128 t = tcg_temp_new_i128();
+
+ tcg_gen_ld_i128(t, cpu_env, offset + offsetof(YMMReg, YMM_X(0)));
+ tcg_gen_qemu_st_i128(t, s->A0, mem_index,
+ MO_128 | MO_LE | (align ? MO_ALIGN_32 : 0));
tcg_gen_addi_tl(s->tmp0, s->A0, 16);
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(2)));
- tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
- tcg_gen_addi_tl(s->tmp0, s->A0, 24);
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(3)));
- tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+ tcg_gen_ld_i128(t, cpu_env, offset + offsetof(YMMReg, YMM_X(1)));
+ tcg_gen_qemu_st_i128(t, s->tmp0, mem_index,
+ MO_128 | MO_LE | (align ? MO_ALIGN_16 : 0));
}
#include "decode-new.h"
--
2.34.1
^ permalink raw reply related [flat|nested] 3+ messages in thread