[PULL 14/31] tcg/i386: Implement cmp_vec with avx512 insns

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Richard Henderson <richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Cc: peter.maydell@linaro.org
Subject: [PULL 14/31] tcg/i386: Implement cmp_vec with avx512 insns
Date: Sun, 22 Sep 2024 14:00:55 +0200	[thread overview]
Message-ID: <20240922120112.5067-15-richard.henderson@linaro.org> (raw)
In-Reply-To: <20240922120112.5067-1-richard.henderson@linaro.org>

The sse/avx instruction set only has EQ and GT as direct comparisons.
Other signed comparisons can be generated from swapping and inversion.
However, unsigned comparisons are not available and must be transformed
to signed comparisons by biasing the inputs.

The avx512 instruction set has a complete set of comparisons, with
results placed into a predicate register.  We can produce the normal
cmp_vec result by using VPMOVM2*.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 64 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 210389955d..b1d642fc67 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -413,6 +413,14 @@ static bool tcg_target_const_match(int64_t val, int ct,
 #define OPC_UD2         (0x0b | P_EXT)
 #define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
 #define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
+#define OPC_VPCMPB      (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
+#define OPC_VPCMPUB     (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
+#define OPC_VPCMPW      (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPCMPUW     (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPCMPD      (0x1f | P_EXT3A | P_DATA16 | P_EVEX)
+#define OPC_VPCMPUD     (0x1e | P_EXT3A | P_DATA16 | P_EVEX)
+#define OPC_VPCMPQ      (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPCMPUQ     (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
 #define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
 #define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
@@ -421,6 +429,10 @@ static bool tcg_target_const_match(int64_t val, int ct,
 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
+#define OPC_VPMOVM2B    (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX)
+#define OPC_VPMOVM2W    (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
+#define OPC_VPMOVM2D    (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX)
+#define OPC_VPMOVM2Q    (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
 #define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
 #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
 #define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
@@ -3110,9 +3122,59 @@ static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece,
     return fixup & NEED_INV;
 }
 
+static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece,
+                               TCGReg v1, TCGReg v2, TCGCond cond)
+{
+    static const int cmpm_insn[2][4] = {
+        { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ },
+        { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ }
+    };
+    static const int cond_ext[16] = {
+        [TCG_COND_EQ] = 0,
+        [TCG_COND_NE] = 4,
+        [TCG_COND_LT] = 1,
+        [TCG_COND_LTU] = 1,
+        [TCG_COND_LE] = 2,
+        [TCG_COND_LEU] = 2,
+        [TCG_COND_NEVER] = 3,
+        [TCG_COND_GE] = 5,
+        [TCG_COND_GEU] = 5,
+        [TCG_COND_GT] = 6,
+        [TCG_COND_GTU] = 6,
+        [TCG_COND_ALWAYS] = 7,
+    };
+
+    tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece],
+                           /* k1 */ 1, v1, v2, type);
+    tcg_out8(s, cond_ext[cond]);
+}
+
+static void tcg_out_k1_to_vec(TCGContext *s, TCGType type,
+                              unsigned vece, TCGReg dest)
+{
+    static const int movm_insn[] = {
+        OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q
+    };
+    tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type);
+}
+
 static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
 {
+    /*
+     * With avx512, we have a complete set of comparisons into mask.
+     * Unless there's a single insn expansion for the comparison,
+     * expand via a mask in k1.
+     */
+    if ((vece <= MO_16 ? have_avx512bw : have_avx512dq)
+        && cond != TCG_COND_EQ
+        && cond != TCG_COND_LT
+        && cond != TCG_COND_GT) {
+        tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond);
+        tcg_out_k1_to_vec(s, type, vece, v0);
+        return;
+    }
+
     if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) {
         tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1);
         tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type);
@@ -4078,7 +4140,7 @@ static TCGCond expand_vec_cond(TCGType type, unsigned vece,
      * We must bias the inputs so that they become signed.
      * All other swapping and inversion are handled during code generation.
      */
-    if (vece == MO_64 && is_unsigned_cond(cond)) {
+    if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) {
         TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1));
         TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2));
         TCGv_vec t1 = tcg_temp_new_vec(type);
-- 
2.43.0

next prev parent reply	other threads:[~2024-09-22 12:02 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-22 12:00 [PULL 00/31] tcg + linux-user patch queue Richard Henderson
2024-09-22 12:00 ` [PULL 01/31] tcg: Return TCGOp from tcg_gen_op[1-6] Richard Henderson
2024-09-22 12:00 ` [PULL 02/31] tcg: Propagate new TCGOp to add_as_label_use Richard Henderson
2024-09-22 12:00 ` [PULL 03/31] tcg: Fix iteration step in 32-bit gvec operation Richard Henderson
2024-09-22 12:00 ` [PULL 04/31] tcg: Export vec_gen_6 Richard Henderson
2024-09-22 12:00 ` [PULL 05/31] tcg/i386: Split out tcg_out_vex_modrm_type Richard Henderson
2024-09-22 12:00 ` [PULL 06/31] tcg/i386: Do not expand cmp_vec early Richard Henderson
2024-09-22 12:00 ` [PULL 07/31] tcg/i386: Do not expand cmpsel_vec early Richard Henderson
2024-09-22 12:00 ` [PULL 08/31] tcg/ppc: Do not expand cmp_vec early Richard Henderson
2024-09-22 12:00 ` [PULL 09/31] tcg/s390x: " Richard Henderson
2024-09-22 12:00 ` [PULL 10/31] tcg/optimize: Fold movcond with true and false values identical Richard Henderson
2024-09-22 12:00 ` [PULL 11/31] tcg/optimize: Optimize cmp_vec and cmpsel_vec Richard Henderson
2024-09-22 12:00 ` [PULL 12/31] tcg/optimize: Optimize bitsel_vec Richard Henderson
2024-09-22 12:00 ` [PULL 13/31] tcg/i386: Optimize cmpsel with constant 0 operand 3 Richard Henderson
2024-09-22 12:00 ` Richard Henderson [this message]
2024-09-22 12:00 ` [PULL 15/31] tcg/i386: Add predicate parameters to tcg_out_evex_opc Richard Henderson
2024-09-22 12:00 ` [PULL 16/31] tcg/i386: Implement cmpsel_vec with avx512 insns Richard Henderson
2024-09-22 12:00 ` [PULL 17/31] tcg/i386: Implement vector TST{EQ,NE} for avx512 Richard Henderson
2024-09-22 12:00 ` [PULL 18/31] tcg/ppc: Implement cmpsel_vec Richard Henderson
2024-09-22 12:01 ` [PULL 19/31] tcg/ppc: Optimize cmpsel with constant 0/-1 arguments Richard Henderson
2024-09-22 12:01 ` [PULL 20/31] tcg/s390x: Implement cmpsel_vec Richard Henderson
2024-09-22 12:01 ` [PULL 21/31] tcg/s390x: Optimize cmpsel with constant 0/-1 arguments Richard Henderson
2024-09-22 12:01 ` [PULL 22/31] target/ppc: Fix lxvx/stxvx facility check Richard Henderson
2024-09-22 12:01 ` [PULL 23/31] linux-user: update syscall_nr.h to Linux v6.10 Richard Henderson
2024-09-22 12:01 ` [PULL 24/31] linux-user, mips: update syscall-args-o32.c.inc " Richard Henderson
2024-09-22 12:01 ` [PULL 25/31] linux-user: update syscall.tbl " Richard Henderson
2024-09-22 12:01 ` [PULL 26/31] linux-user,aarch64: move to syscalltbl file Richard Henderson
2024-09-22 12:01 ` [PULL 27/31] linux-user,openrisc: " Richard Henderson
2024-09-22 12:01 ` [PULL 28/31] linux-user,riscv: " Richard Henderson
2024-09-22 12:01 ` [PULL 29/31] linux-user,hexagon: " Richard Henderson
2024-09-22 12:01 ` [PULL 30/31] linux-user,loongarch: " Richard Henderson
2024-09-22 12:01 ` [PULL 31/31] linux-user: update syscall.tbl to Linux v6.11 Richard Henderson
2024-09-27 15:18 ` [PULL 00/31] tcg + linux-user patch queue Peter Maydell

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:210389955 dfblob:b1d642fc6 )
 OR (
bs:"[PULL 14/31] tcg/i386: Implement cmp_vec with avx512 insns" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240922120112.5067-15-richard.henderson@linaro.org \
    --to=richard.henderson@linaro.org \
    --cc=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).