[Qemu-devel] [PATCH 48/65] tcg/i386: Handle ctz and clz opcodes

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Richard Henderson <rth@twiddle.net>
To: qemu-devel@nongnu.org
Subject: [Qemu-devel] [PATCH 48/65] tcg/i386: Handle ctz and clz opcodes
Date: Fri, 23 Dec 2016 20:00:25 -0800	[thread overview]
Message-ID: <20161224040042.12654-49-rth@twiddle.net> (raw)
In-Reply-To: <20161224040042.12654-1-rth@twiddle.net>

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.h     |   8 +--
 tcg/i386/tcg-target.inc.c | 125 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 120 insertions(+), 13 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index f2d9955..8fff287 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -93,8 +93,8 @@ extern bool have_bmi1;
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
-#define TCG_TARGET_HAS_clz_i32          0
-#define TCG_TARGET_HAS_ctz_i32          0
+#define TCG_TARGET_HAS_clz_i32          1
+#define TCG_TARGET_HAS_ctz_i32          1
 #define TCG_TARGET_HAS_deposit_i32      1
 #define TCG_TARGET_HAS_extract_i32      1
 #define TCG_TARGET_HAS_sextract_i32     1
@@ -127,8 +127,8 @@ extern bool have_bmi1;
 #define TCG_TARGET_HAS_eqv_i64          0
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
-#define TCG_TARGET_HAS_clz_i64          0
-#define TCG_TARGET_HAS_ctz_i64          0
+#define TCG_TARGET_HAS_clz_i64          1
+#define TCG_TARGET_HAS_ctz_i64          1
 #define TCG_TARGET_HAS_deposit_i64      1
 #define TCG_TARGET_HAS_extract_i64      1
 #define TCG_TARGET_HAS_sextract_i64     0
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 651d96c..3ed8cd1 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -92,6 +92,7 @@ static const int tcg_target_call_oarg_regs[] = {
 #define TCG_CT_CONST_S32 0x100
 #define TCG_CT_CONST_U32 0x200
 #define TCG_CT_CONST_I32 0x400
+#define TCG_CT_CONST_WSZ 0x800
 
 /* Registers used with L constraint, which are the first argument 
    registers on x86_64, and two random call clobbered registers on
@@ -138,6 +139,11 @@ static bool have_bmi2;
 #else
 # define have_bmi2 0
 #endif
+#if defined(CONFIG_CPUID_H) && defined(bit_LZCNT)
+static bool have_lzcnt;
+#else
+# define have_lzcnt 0
+#endif
 
 static tcg_insn_unit *tb_ret_addr;
 
@@ -214,6 +220,10 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
             tcg_regset_set32(ct->u.regs, 0, 0xff);
         }
         break;
+    case 'W':
+        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
+        ct->ct |= TCG_CT_CONST_WSZ;
+        break;
 
         /* qemu_ld/st address constraint */
     case 'L':
@@ -260,6 +270,9 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
     if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
         return 1;
     }
+    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
+        return 1;
+    }
     return 0;
 }
 
@@ -293,6 +306,8 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
 #define OPC_ANDN        (0xf2 | P_EXT38)
 #define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
+#define OPC_BSF         (0xbc | P_EXT)
+#define OPC_BSR         (0xbd | P_EXT)
 #define OPC_BSWAP	(0xc8 | P_EXT)
 #define OPC_CALL_Jz	(0xe8)
 #define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
@@ -307,6 +322,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_JMP_long	(0xe9)
 #define OPC_JMP_short	(0xeb)
 #define OPC_LEA         (0x8d)
+#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
 #define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
 #define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
 #define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
@@ -333,6 +349,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
 #define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 #define OPC_TESTL	(0x85)
+#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
 #define OPC_XCHG_ax_r32	(0x90)
 
 #define OPC_GRP3_Ev	(0xf7)
@@ -418,6 +435,11 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
     if (opc & P_ADDR32) {
         tcg_out8(s, 0x67);
     }
+    if (opc & P_SIMDF3) {
+        tcg_out8(s, 0xf3);
+    } else if (opc & P_SIMDF2) {
+        tcg_out8(s, 0xf2);
+    }
 
     rex = 0;
     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
@@ -452,6 +474,11 @@ static void tcg_out_opc(TCGContext *s, int opc)
     if (opc & P_DATA16) {
         tcg_out8(s, 0x66);
     }
+    if (opc & P_SIMDF3) {
+        tcg_out8(s, 0xf3);
+    } else if (opc & P_SIMDF2) {
+        tcg_out8(s, 0xf2);
+    }
     if (opc & (P_EXT | P_EXT38)) {
         tcg_out8(s, 0x0f);
         if (opc & P_EXT38) {
@@ -1080,13 +1107,11 @@ static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
 }
 #endif
 
-static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGArg dest,
-                              TCGArg c1, TCGArg c2, int const_c2,
-                              TCGArg v1)
+static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
+                         TCGReg dest, TCGReg v1)
 {
-    tcg_out_cmp(s, c1, c2, const_c2, 0);
     if (have_cmov) {
-        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond], dest, v1);
+        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
     } else {
         TCGLabel *over = gen_new_label();
         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
@@ -1095,16 +1120,64 @@ static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGArg dest,
     }
 }
 
+static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
+                              TCGReg c1, TCGArg c2, int const_c2,
+                              TCGReg v1)
+{
+    tcg_out_cmp(s, c1, c2, const_c2, 0);
+    tcg_out_cmov(s, cond, 0, dest, v1);
+}
+
 #if TCG_TARGET_REG_BITS == 64
-static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGArg dest,
-                              TCGArg c1, TCGArg c2, int const_c2,
-                              TCGArg v1)
+static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
+                              TCGReg c1, TCGArg c2, int const_c2,
+                              TCGReg v1)
 {
     tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
-    tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | P_REXW, dest, v1);
+    tcg_out_cmov(s, cond, P_REXW, dest, v1);
 }
 #endif
 
+static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
+                        TCGArg arg2, bool const_a2)
+{
+    if (const_a2) {
+        tcg_debug_assert(have_bmi1);
+        tcg_debug_assert(arg2 == (rexw ? 64 : 32));
+        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
+    } else {
+        tcg_debug_assert(dest != arg2);
+        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
+        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
+    }
+}
+
+static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
+                        TCGArg arg2, bool const_a2)
+{
+    if (have_lzcnt) {
+        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
+        if (const_a2) {
+            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
+        } else {
+            tcg_debug_assert(dest != arg2);
+            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
+        }
+    } else {
+        tcg_debug_assert(!const_a2);
+        tcg_debug_assert(dest != arg1);
+        tcg_debug_assert(dest != arg2);
+
+        /* Recall that the output of BSR is the index not the count.  */
+        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
+        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
+
+        /* Since we have destroyed the flags from BSR, we have to re-test.  */
+        tcg_out_cmp(s, arg1, 0, 1, rexw);
+        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
+    }
+}
+
 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
 {
     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
@@ -1995,6 +2068,13 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    OP_32_64(ctz):
+        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
+        break;
+    OP_32_64(clz):
+        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
+        break;
+
     case INDEX_op_brcond_i32:
         tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
         break;
@@ -2359,6 +2439,24 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
                 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
             return &arith2;
         }
+    case INDEX_op_ctz_i32:
+    case INDEX_op_ctz_i64:
+        {
+            static const TCGTargetOpDef ctz[2] = {
+                { .args_ct_str = { "&r", "r", "r" } },
+                { .args_ct_str = { "&r", "r", "rW" } },
+            };
+            return &ctz[have_bmi1];
+        }
+    case INDEX_op_clz_i32:
+    case INDEX_op_clz_i64:
+        {
+            static const TCGTargetOpDef clz[2] = {
+                { .args_ct_str = { "&r", "r", "r" } },
+                { .args_ct_str = { "&r", "r", "rW" } },
+            };
+            return &clz[have_lzcnt];
+        }
 
     case INDEX_op_qemu_ld_i32:
         return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
@@ -2509,6 +2607,15 @@ static void tcg_target_init(TCGContext *s)
     }
 #endif
 
+#ifndef have_lzcnt
+    max = __get_cpuid_max(0x8000000, 0);
+    if (max >= 1) {
+        __cpuid(0x80000001, a, b, c, d);
+        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
+        have_lzcnt = (c & bit_LZCNT) != 0;
+    }
+#endif
+
     if (TCG_TARGET_REG_BITS == 64) {
         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffff);
-- 
2.9.3

next prev parent reply	other threads:[~2016-12-24  4:01 UTC|newest]

Thread overview: 69+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-12-24  3:59 [Qemu-devel] [PATCH v5 00/65] tcg 2.9 patch queue Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 01/65] tcg: Add field extraction primitives Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 02/65] tcg: Minor adjustments to deposit expanders Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 03/65] tcg: Add deposit_z expander Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 04/65] tcg/aarch64: Implement field extraction opcodes Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 05/65] tcg/arm: Move isa detection to tcg-target.h Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 06/65] tcg/arm: Implement field extraction opcodes Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 07/65] tcg/i386: " Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 08/65] tcg/mips: " Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 09/65] tcg/ppc: " Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 10/65] tcg/s390: Expose host facilities to tcg-target.h Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 11/65] tcg/s390: Implement field extraction opcodes Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 12/65] tcg/s390: Support deposit into zero Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 13/65] target-alpha: Use deposit and extract ops Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 14/65] target-arm: Use new " Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 15/65] target-i386: " Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 16/65] target-mips: Use the new extract op Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 17/65] target-ppc: Use the new deposit and extract ops Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 18/65] target-s390x: " Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 19/65] tcg/optimize: Fold movcond 0/1 into setcond Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 20/65] tcg: Add markup for output requires new register Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 21/65] tcg: Transition flat op_defs array to a target callback Richard Henderson
2016-12-24  3:59 ` [Qemu-devel] [PATCH 22/65] tcg: Pass the opcode width to target_parse_constraint Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 23/65] tcg: Allow an operand to be matching or a constant Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 24/65] tcg: Add clz and ctz opcodes Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 25/65] disas/i386.c: Handle tzcnt Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 26/65] disas/ppc: Handle popcnt and cnttz Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 27/65] target-alpha: Use the ctz and clz opcodes Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 28/65] target-cris: Use clz opcode Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 29/65] target-microblaze: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 30/65] target-mips: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 31/65] target-openrisc: Use clz and ctz opcodes Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 32/65] target-ppc: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 33/65] target-s390x: Use clz opcode Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 34/65] target-tilegx: Use clz and ctz opcodes Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 35/65] target-tricore: Use clz opcode Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 36/65] target-unicore32: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 37/65] target-xtensa: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 38/65] target-arm: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 39/65] target-i386: Use clz and ctz opcodes Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 40/65] tcg/ppc: Handle ctz and clz opcodes Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 41/65] tcg/aarch64: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 42/65] tcg/arm: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 43/65] tcg/mips: Handle clz opcode Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 44/65] tcg/s390: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 45/65] tcg/i386: Fuly convert tcg_target_op_def Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 46/65] tcg/i386: Hoist common arguments in tcg_out_op Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 47/65] tcg/i386: Allow bmi2 shiftx to have non-matching operands Richard Henderson
2016-12-24  4:00 ` Richard Henderson [this message]
2016-12-24  4:00 ` [Qemu-devel] [PATCH 49/65] tcg/i386: Rely on undefined/undocumented behaviour of BSF/BSR Richard Henderson
2017-01-16 19:19   ` Eduardo Habkost
2017-01-16 19:35     ` Eduardo Habkost
2016-12-24  4:00 ` [Qemu-devel] [PATCH 50/65] tcg: Add helpers for clrsb Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 51/65] target-arm: Use clrsb helper Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 52/65] target-tricore: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 53/65] target-xtensa: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 54/65] tcg: Add opcode for ctpop Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 55/65] target-alpha: Use ctpop helper Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 56/65] target-ppc: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 57/65] target-s390x: Avoid a loop for popcnt Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 58/65] target-sparc: Use ctpop helper Richard Henderson
2016-12-30 18:25   ` Mark Cave-Ayland
2016-12-24  4:00 ` [Qemu-devel] [PATCH 59/65] target-tilegx: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 60/65] target-i386: " Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 61/65] qemu/host-utils.h: Reduce the operation count in the fallback ctpop Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 62/65] tests: New test-bitcnt Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 63/65] tcg: Use ctpop to generate ctz if needed Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 64/65] tcg/ppc: Handle ctpop opcode Richard Henderson
2016-12-24  4:00 ` [Qemu-devel] [PATCH 65/65] tcg/i386: " Richard Henderson

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:f2d9955 dfblob:8fff287 dfblob:651d96c dfblob:3ed8cd1 )
 OR (
bs:"[Qemu-devel] [PATCH 48/65] tcg/i386: Handle ctz and clz opcodes" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20161224040042.12654-49-rth@twiddle.net \
    --to=rth@twiddle.net \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).