[PATCH v2 22/37] target/i386: reimplement 0x0f 0x78-0x7f, add AVX

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Paolo Bonzini <pbonzini@redhat.com>
To: qemu-devel@nongnu.org
Cc: richard.henderson@linaro.org, paul@nowt.org
Subject: [PATCH v2 22/37] target/i386: reimplement 0x0f 0x78-0x7f, add AVX
Date: Tue, 20 Sep 2022 19:24:52 +0200	[thread overview]
Message-ID: <20220920172507.95568-23-pbonzini@redhat.com> (raw)
In-Reply-To: <20220920172507.95568-1-pbonzini@redhat.com>

These are a mixed batch, including the first two horizontal
(66 and F2 only) operations, more moves, and SSE4a extract/insert.

Because SSE4a is pretty rare, I chose to leave the helper as they are,
but it is possible to unify them by loading index and length from the
source XMM register and generating deposit or extract TCG ops.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/decode-new.c.inc | 51 +++++++++++++++++++
 target/i386/tcg/emit.c.inc       | 86 ++++++++++++++++++++++++++++++++
 target/i386/tcg/translate.c      |  1 +
 3 files changed, 138 insertions(+)

diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 5b753ea329..6220142cdb 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -168,6 +168,50 @@ static void decode_0F6F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
     *entry = *decode_by_prefix(s, opcodes_0F6F);
 }
 
+static void decode_0F78(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F78[4] = {
+        {},
+        X86_OP_ENTRY3(EXTRQ_i,       V,x, None,None, I,w,  cpuid(SSE4A)),
+        {},
+        X86_OP_ENTRY3(INSERTQ_i,     V,x, U,x, I,w,        cpuid(SSE4A)),
+    };
+    *entry = *decode_by_prefix(s, opcodes_0F78);
+}
+
+static void decode_0F79(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    if (s->prefix & PREFIX_REPNZ) {
+        entry->gen = gen_INSERTQ_r;
+    } else if (s->prefix & PREFIX_DATA) {
+        entry->gen = gen_EXTRQ_r;
+    } else {
+        entry->gen = NULL;
+    };
+}
+
+static void decode_0F7E(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F7E[4] = {
+        X86_OP_ENTRY3(MOVD_from,  E,y, None,None, P,y, vex5 mmx),
+        X86_OP_ENTRY3(MOVD_from,  E,y, None,None, V,y, vex5),
+        X86_OP_ENTRY3(MOVQ,       V,x, None,None, W,q, vex5),  /* wrong dest Vy on SDM! */
+        {},
+    };
+    *entry = *decode_by_prefix(s, opcodes_0F7E);
+}
+
+static void decode_0F7F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F7F[4] = {
+        X86_OP_ENTRY3(MOVDQ,       W,x, None,None, V,x, vex1 mmx), /* movq */
+        X86_OP_ENTRY3(MOVDQ,       W,x, None,None, V,x, vex1), /* movdqa */
+        X86_OP_ENTRY3(MOVDQ,       W,x, None,None, V,x, vex4_unal), /* movdqu */
+        {},
+    };
+    *entry = *decode_by_prefix(s, opcodes_0F7F);
+}
+
 static const X86OpEntry opcodes_0F38_00toEF[240] = {
 };
 
@@ -317,6 +361,13 @@ static const X86OpEntry opcodes_0F[256] = {
     [0x6e] = X86_OP_ENTRY3(MOVD_to,    V,x, None,None, E,y, vex5 mmx p_00_66),  /* wrong dest Vy on SDM! */
     [0x6f] = X86_OP_GROUP0(0F6F),
 
+    [0x78] = X86_OP_GROUP0(0F78),
+    [0x79] = X86_OP_GROUP2(0F79,       V,x, U,x,       cpuid(SSE4A)),
+    [0x7c] = X86_OP_ENTRY3(VHADD,      V,x, H,x, W,x,  vex2 cpuid(SSE3) p_66_f2),
+    [0x7d] = X86_OP_ENTRY3(VHSUB,      V,x, H,x, W,x,  vex2 cpuid(SSE3) p_66_f2),
+    [0x7e] = X86_OP_GROUP0(0F7E),
+    [0x7f] = X86_OP_GROUP0(0F7F),
+
     /* Incorrectly missing from 2-17 */
     [0xd8] = X86_OP_ENTRY3(PSUBUSB,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
     [0xd9] = X86_OP_ENTRY3(PSUBUSW,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 58b2fd7a2a..140a621abf 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -443,6 +443,30 @@ static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decod
 UNARY_FP32_SSE(VRSQRT, rsqrt)
 UNARY_FP32_SSE(VRCP, rcp)
 
+/*
+ * 66 = v*pd Vpd, Hpd, Wpd
+ * f2 = v*ps Vps, Hps, Wps
+ */
+static inline void gen_horizontal_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                         SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
+                                         SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm)
+{
+    SSEFunc_0_eppp ps, pd, fn;
+    ps = s->vex_l ? ps_ymm : ps_xmm;
+    pd = s->vex_l ? pd_ymm : pd_xmm;
+    fn = s->prefix & PREFIX_DATA ? pd : ps;
+    fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+}
+#define HORIZONTAL_FP_SSE(uname, lname)                                            \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                  \
+    gen_horizontal_fp_sse(s, env, decode,                                          \
+                          gen_helper_##lname##pd_xmm, gen_helper_##lname##ps_xmm,  \
+                          gen_helper_##lname##pd_ymm, gen_helper_##lname##ps_ymm); \
+}
+HORIZONTAL_FP_SSE(VHADD, hadd)
+HORIZONTAL_FP_SSE(VHSUB, hsub)
+
 #define BINARY_INT_GVEC(uname, func, ...)                                          \
 static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
 {                                                                                  \
@@ -716,6 +740,32 @@ static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     gen_helper_crc32(s->T0, s->tmp2_i32, s->T1, tcg_constant_i32(8 << ot));
 }
 
+static void gen_EXTRQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
+    TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
+
+    gen_helper_extrq_i(cpu_env, OP_PTR0, index, length);
+}
+
+static void gen_EXTRQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_helper_extrq_r(cpu_env, OP_PTR0, OP_PTR2);
+}
+
+static void gen_INSERTQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
+    TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
+
+    gen_helper_insertq_i(cpu_env, OP_PTR0, OP_PTR1, index, length);
+}
+
+static void gen_INSERTQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_helper_insertq_r(cpu_env, OP_PTR0, OP_PTR2);
+}
+
 static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
@@ -728,6 +778,24 @@ static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     }
 }
 
+static void gen_MOVD_from(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+
+    switch (ot) {
+    case MO_32:
+#ifdef TARGET_X86_64
+        tcg_gen_ld32u_tl(s->T0, cpu_env, decode->op[2].offset);
+        break;
+    case MO_64:
+#endif
+        tcg_gen_ld_tl(s->T0, cpu_env, decode->op[2].offset);
+        break;
+    default:
+        abort();
+    }
+}
+
 static void gen_MOVD_to(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[2].ot;
@@ -766,6 +834,24 @@ static void gen_MOVMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode
     tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
 }
 
+static void gen_MOVQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = vector_len(s, decode);
+    int lo_ofs = vector_elem_offset(&decode->op[0], MO_64, 0);
+
+    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset);
+    /*
+     * tcg_gen_gvec_dup_i64(MO_64, op0.offset, 8, vec_len, s->tmp1_64) would
+     * seem to work, but it does not on big-endian platforms; the cleared parts
+     * are always at higher addresses, but cross-endian emulation inverts the
+     * byte order so that the cleared parts need to be at *lower* addresses.
+     * Because oprsz is 8, we see this here even for SSE; but more in general,
+     * it disqualifies using oprsz < maxsz to emulate VEX128.
+     */
+    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+    tcg_gen_st_i64(s->tmp1_i64, cpu_env, lo_ofs);
+}
+
 static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 95172c30d0..4404440d87 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -4696,6 +4696,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
 #endif
         if (use_new &&
             ((b >= 0x150 && b <= 0x16f) ||
+             (b >= 0x178 && b <= 0x17f) ||
              (b >= 0x1d8 && b <= 0x1ff && (b & 8)))) {
             disas_insn_new(s, cpu, b + 0x100);
             return s->pc;
-- 
2.37.2

next prev parent reply	other threads:[~2022-09-20 22:08 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-09-20 17:24 [PATCH v2 00/37] target/i386: new decoder + AVX implementation Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 01/37] target/i386: Define XMMReg and access macros, align ZMM registers Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 02/37] target/i386: make ldo/sto operations consistent with ldq Paolo Bonzini
2022-09-24 15:00   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 03/37] target/i386: REPZ and REPNZ are mutually exclusive Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 04/37] target/i386: introduce insn_get_addr Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 05/37] target/i386: add core of new i386 decoder Paolo Bonzini
2022-09-24 15:09   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 06/37] target/i386: add ALU load/writeback core Paolo Bonzini
2022-09-24 15:13   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 07/37] target/i386: add CPUID[EAX=7, ECX=0].ECX to DisasContext Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 08/37] target/i386: add CPUID feature checks to new decoder Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 09/37] target/i386: add AVX_EN hflag Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 10/37] target/i386: validate VEX prefixes via the instructions' exception classes Paolo Bonzini
2022-09-24 20:19   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 11/37] target/i386: validate SSE prefixes directly in the decoding table Paolo Bonzini
2022-09-24 20:23   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 12/37] target/i386: move scalar 0F 38 and 0F 3A instruction to new decoder Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 13/37] target/i386: Prepare ops_sse_header.h for 256 bit AVX Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 14/37] target/i386: extend helpers to support VEX.V 3- and 4- operand encodings Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 15/37] target/i386: support operand merging in binary scalar helpers Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 16/37] target/i386: provide 3-operand versions of unary " Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 17/37] target/i386: implement additional AVX comparison operators Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 18/37] target/i386: Introduce 256-bit vector helpers Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 19/37] target/i386: reimplement 0x0f 0x60-0x6f, add AVX Paolo Bonzini
2022-09-24 20:31   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 20/37] target/i386: reimplement 0x0f 0xd8-0xdf, 0xe8-0xef, 0xf8-0xff, " Paolo Bonzini
2022-09-24 20:32   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 21/37] target/i386: reimplement 0x0f 0x50-0x5f, " Paolo Bonzini
2022-09-24 20:37   ` Richard Henderson
2022-09-20 17:24 ` Paolo Bonzini [this message]
2022-09-24 20:43   ` [PATCH v2 22/37] target/i386: reimplement 0x0f 0x78-0x7f, " Richard Henderson
2022-09-26  7:24     ` Paolo Bonzini
2022-09-26  7:46       ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 23/37] target/i386: reimplement 0x0f 0x70-0x77, " Paolo Bonzini
2022-09-24 20:53   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 24/37] target/i386: reimplement 0x0f 0xd0-0xd7, 0xe0-0xe7, 0xf0-0xf7, " Paolo Bonzini
2022-09-24 20:56   ` Richard Henderson
2022-09-26  7:56     ` Paolo Bonzini
2022-09-20 17:24 ` [PATCH v2 25/37] target/i386: clarify (un)signedness of immediates from 0F3Ah opcodes Paolo Bonzini
2022-09-24 20:57   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 26/37] target/i386: reimplement 0x0f 0x3a, add AVX Paolo Bonzini
2022-09-24 21:02   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 27/37] target/i386: Use tcg gvec ops for pmovmskb Paolo Bonzini
2022-09-24 21:08   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 28/37] target/i386: reimplement 0x0f 0x38, add AVX Paolo Bonzini
2022-09-24 21:14   ` Richard Henderson
2022-09-20 17:24 ` [PATCH v2 29/37] target/i386: reimplement 0x0f 0xc2, 0xc4-0xc6, " Paolo Bonzini
2022-09-20 17:25 ` [PATCH v2 30/37] target/i386: reimplement 0x0f 0x10-0x17, " Paolo Bonzini
2022-09-24 21:16   ` Richard Henderson
2022-09-20 17:25 ` [PATCH v2 31/37] target/i386: reimplement 0x0f 0x28-0x2f, " Paolo Bonzini
2022-09-24 21:18   ` Richard Henderson
2022-09-20 17:25 ` [PATCH v2 32/37] target/i386: implement XSAVE and XRSTOR of AVX registers Paolo Bonzini
2022-09-24 21:19   ` Richard Henderson
2022-09-20 17:25 ` [PATCH v2 33/37] target/i386: implement VLDMXCSR/VSTMXCSR Paolo Bonzini
2022-09-24 21:20   ` Richard Henderson
2022-09-20 17:25 ` [PATCH v2 34/37] target/i386: Enable AVX cpuid bits when using TCG Paolo Bonzini
2022-09-20 17:25 ` [PATCH v2 35/37] tests/tcg: extend SSE tests to AVX Paolo Bonzini
2022-09-20 17:25 ` [PATCH v2 36/37] target/i386: move 3DNow to the new decoder Paolo Bonzini
2022-09-24 21:24   ` Richard Henderson
2022-09-20 17:25 ` [PATCH v2 37/37] target/i386: remove old SSE decoder Paolo Bonzini

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:5b753ea32 dfblob:6220142cd dfblob:58b2fd7a2 dfblob:140a621ab
dfblob:95172c30d dfblob:4404440d8 )
 OR (
bs:"[PATCH v2 22/37] target/i386: reimplement 0x0f 0x78-0x7f, add AVX" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220920172507.95568-23-pbonzini@redhat.com \
    --to=pbonzini@redhat.com \
    --cc=paul@nowt.org \
    --cc=qemu-devel@nongnu.org \
    --cc=richard.henderson@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).